From 1a6c978569bc501699d12330a2aef38e73c438ff Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Wed, 13 May 2026 16:00:07 +0800 Subject: [PATCH 01/36] feat: add filename search to content and OCR search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added filename keyword search capability to both content and OCR text search 2. Implemented new API methods setFilenameKeyword() and filenameKeyword() in TextSearchOptionsAPI 3. Added --filename command line option for filename searches 4. Modified search validation to allow short queries when filename search is used 5. Implemented Lucene query building logic that combines filename and content searches with AND logic 6. Added filename search support to command line interface and API Log: Added filename search capability to file content and OCR text search Influence: 1. Test filename-only searches with --filename parameter 2. Test combined content+filename searches 3. Verify handling of short queries when filename search is used 4. Test command line help output for new --filename option 5. Verify API-level filename search functionality feat: 为内容搜索和OCR搜索添加文件名搜索功能 1. 为内容搜索和OCR文本搜索添加文件名关键词搜索功能 2. 在TextSearchOptionsAPI中实现setFilenameKeyword()和filenameKeyword()新 API方法 3. 添加用于文件名搜索的--filename命令行选项 4. 修改搜索验证逻辑以在使用文件名搜索时允许短查询 5. 实现将文件名和内容搜索与AND逻辑相结合的Lucene查询构建逻辑 6. 为命令行界面和API添加文件名搜索支持 Log: 为文件内容和OCR文本搜索添加文件名搜索功能 Influence: 1. 使用--filename参数测试仅文件名搜索 2. 测试组合的内容+文件名搜索 3. 验证使用文件名搜索时对短查询的处理 4. 测试新--filename选项的命令行帮助输出 5. 
验证API级别的文件名搜索功能 --- include/dfm-search/dfm-search/textsearchapi.h | 19 +++++++++++ .../dfm-search-client/cli_options.cpp | 8 +++++ .../dfm-search-client/cli_options.h | 4 +++ src/dfm-search/dfm-search-client/main.cpp | 6 ++++ .../contentsearch/contentsearchengine.cpp | 3 +- .../contentstrategies/indexedstrategy.cpp | 32 +++++++++++++++++++ .../ocrtextsearch/ocrtextsearchengine.cpp | 6 +++- .../ocrtextstrategies/indexedstrategy.cpp | 32 +++++++++++++++++++ .../textsearch/textsearchapi.cpp | 10 ++++++ 9 files changed, 118 insertions(+), 2 deletions(-) diff --git a/include/dfm-search/dfm-search/textsearchapi.h b/include/dfm-search/dfm-search/textsearchapi.h index 3f52e8be..00864d7d 100644 --- a/include/dfm-search/dfm-search/textsearchapi.h +++ b/include/dfm-search/dfm-search/textsearchapi.h @@ -75,6 +75,25 @@ class TextSearchOptionsAPI */ bool isFullTextRetrievalEnabled() const; + // ==================== Filename Search ==================== + + /** + * @brief Sets a keyword to search on the filename field. + * + * When set, the search will also match against the indexed filename field + * in addition to (or instead of) the content field. If both a content keyword + * and a filename keyword are provided, results must match both (AND logic). + * + * @param keyword The filename keyword to search for. + */ + void setFilenameKeyword(const QString &keyword); + + /** + * @brief Gets the filename keyword for search. + * @return The filename keyword, or empty string if not set. 
+ */ + QString filenameKeyword() const; + protected: SearchOptions &m_options; }; diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index 63986e61..861edf00 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -25,6 +25,7 @@ CliOptions::CliOptions() m_fileExtensionsOption(QStringList() << "file-extensions", "Filter by file extensions, comma separated", "extensions"), m_maxResultsOption(QStringList() << "max-results", "Maximum number of results (0 for unlimited)", "number", "0"), m_maxPreviewOption(QStringList() << "max-preview", "Max content preview length", "length", "200"), + m_filenameOption(QStringList() << "filename", "Search by filename in content/ocr index", "keyword"), m_wildcardOption(QStringList() << "wildcard", "Enable wildcard search with * and ? patterns"), m_jsonOption(QStringList() << "json" << "j", @@ -64,6 +65,7 @@ void CliOptions::setupOptions() m_parser.addOption(m_fileExtensionsOption); m_parser.addOption(m_maxResultsOption); m_parser.addOption(m_maxPreviewOption); + m_parser.addOption(m_filenameOption); m_parser.addOption(m_wildcardOption); m_parser.addOption(m_jsonOption); m_parser.addOption(m_verboseOption); @@ -108,6 +110,7 @@ void CliOptions::printHelp() const std::cout << " --file-extensions= Filter by file extensions, comma separated" << std::endl; std::cout << " --max-results= Maximum number of results (0 for unlimited)" << std::endl; std::cout << " --max-preview= Max content preview length (for content/ocr search)" << std::endl; + std::cout << " --filename= Search by filename in content/ocr index" << std::endl; std::cout << std::endl; std::cout << "Time Range Filter Options:" << std::endl; std::cout << " --time-field= Time field to filter (birth=creation, modify=modification)" << std::endl; @@ -220,6 +223,11 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) #endif } + // 解析文件名搜索选项(仅对 
content/ocr 搜索有效) + if (m_parser.isSet(m_filenameOption)) { + config.filenameKeyword = m_parser.value(m_filenameOption); + } + // 解析数值选项 if (m_parser.isSet(m_maxResultsOption)) { bool ok; diff --git a/src/dfm-search/dfm-search-client/cli_options.h b/src/dfm-search/dfm-search-client/cli_options.h index 10bdbf15..a96d782d 100644 --- a/src/dfm-search/dfm-search-client/cli_options.h +++ b/src/dfm-search/dfm-search-client/cli_options.h @@ -44,6 +44,9 @@ struct SearchCliConfig int maxResults = 0; // 0 表示不限制 int maxPreviewLength = 200; + // 文件名搜索选项 + QString filenameKeyword; + // 时间范围过滤 bool hasTimeFilter = false; DFMSEARCH::TimeRangeFilter timeFilter; @@ -93,6 +96,7 @@ class CliOptions QCommandLineOption m_fileExtensionsOption; QCommandLineOption m_maxResultsOption; QCommandLineOption m_maxPreviewOption; + QCommandLineOption m_filenameOption; QCommandLineOption m_wildcardOption; QCommandLineOption m_jsonOption; QCommandLineOption m_verboseOption; diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index 26ed2457..9197773f 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -53,12 +53,18 @@ static void configureSearchOptions(SearchOptions &options, const SearchCliConfig contentOptions.setFullTextRetrievalEnabled(true); contentOptions.setSearchResultHighlightEnabled(true); contentOptions.setFilenameContentMixedAndSearchEnabled(true); + if (!config.filenameKeyword.isEmpty()) { + contentOptions.setFilenameKeyword(config.filenameKeyword); + } } else if (config.searchType == SearchType::Ocr) { OcrTextOptionsAPI ocrTextOptions(options); ocrTextOptions.setMaxPreviewLength(config.maxPreviewLength); ocrTextOptions.setFullTextRetrievalEnabled(true); ocrTextOptions.setSearchResultHighlightEnabled(true); ocrTextOptions.setFilenameOcrContentMixedAndSearchEnabled(true); + if (!config.filenameKeyword.isEmpty()) { + ocrTextOptions.setFilenameKeyword(config.filenameKeyword); + } } // 
应用时间范围过滤 diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp index 882a3155..bca9df31 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp @@ -42,7 +42,8 @@ SearchError ContentSearchEngine::validateSearchConditions() } if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength) { + && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength + && api.filenameKeyword().isEmpty()) { return SearchError(ContentSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index bc498aa6..1efcc582 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -143,6 +143,38 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add filename keyword query + QString filenameKw = optAPI.filenameKeyword(); + if (!filenameKw.isEmpty()) { + Lucene::QueryParserPtr filenameParser = newLucene( + Lucene::LuceneVersion::LUCENE_CURRENT, + LuceneFieldNames::Content::kFilename, + analyzer); + Lucene::QueryPtr filenameQuery = filenameParser->parse( + LuceneQueryUtils::processQueryString(filenameKw, false)); + + if (filenameQuery) { + // Check if content keywords are effectively empty + bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) + ? 
query.keyword().isEmpty() + : (query.subQueries().isEmpty() + || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), + [](const auto &sq) { return sq.keyword().isEmpty(); })); + + if (noContentKeywords) { + // Filename-only search: use filename query directly + mainQuery = filenameQuery; + } else if (mainQuery) { + // Both content and filename: AND combination + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(filenameQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } + m_keywords.append(filenameKw); + } + } + return mainQuery; } catch (const Lucene::LuceneException &e) { diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp index ed9172bb..476bd8ce 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "ocrtextsearchengine.h" +#include + #include "ocrtextstrategies/indexedstrategy.h" DFM_SEARCH_BEGIN_NS @@ -40,8 +42,10 @@ SearchError OcrTextSearchEngine::validateSearchConditions() return SearchError(OcrTextSearchErrorCode::WildcardNotSupported); } + OcrTextOptionsAPI optAPI(m_options); if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength) { + && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength + && optAPI.filenameKeyword().isEmpty()) { return SearchError(OcrTextSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index e561b2a6..2d146e94 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ 
b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -140,6 +140,38 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add filename keyword query + QString filenameKw = optAPI.filenameKeyword(); + if (!filenameKw.isEmpty()) { + Lucene::QueryParserPtr filenameParser = newLucene( + Lucene::LuceneVersion::LUCENE_CURRENT, + LuceneFieldNames::OcrText::kFilename, + analyzer); + Lucene::QueryPtr filenameQuery = filenameParser->parse( + LuceneQueryUtils::processQueryString(filenameKw, false)); + + if (filenameQuery) { + // Check if content keywords are effectively empty + bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) + ? query.keyword().isEmpty() + : (query.subQueries().isEmpty() + || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), + [](const auto &sq) { return sq.keyword().isEmpty(); })); + + if (noContentKeywords) { + // Filename-only search: use filename query directly + mainQuery = filenameQuery; + } else if (mainQuery) { + // Both content and filename: AND combination + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(filenameQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } + m_keywords.append(filenameKw); + } + } + return mainQuery; } catch (const Lucene::LuceneException &e) { diff --git a/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp b/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp index e2182d9f..fe3ed78d 100644 --- a/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp @@ -45,6 +45,16 @@ bool TextSearchOptionsAPI::isFullTextRetrievalEnabled() const return m_options.customOption("fullTextRetrieval").toBool(); } +void TextSearchOptionsAPI::setFilenameKeyword(const QString &keyword) +{ + m_options.setCustomOption("filenameKeyword", keyword); +} + +QString 
TextSearchOptionsAPI::filenameKeyword() const +{ + return m_options.customOption("filenameKeyword").toString(); +} + // ==================== TextSearchResultAPI ==================== TextSearchResultAPI::TextSearchResultAPI(SearchResult &result) From 071ae336f49dd0098c7b2d3f4a2f766ef24bd1b4 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 13:54:37 +0800 Subject: [PATCH 02/36] feat: add natural language semantic search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces a comprehensive natural language processing (NLP) based semantic search system for the file manager. Key changes include: 1. Added SemanticRuleEngine to load and process regex-based rules from JSON config files. Rules are organized into groups (time, filetype, keyword, noise) with priorities and metadata. 2. Implemented DimensionExtractor base class and concrete extractors: - TimeExtractor: parses relative/preset time ("today", "last week") and specific dates - FileTypeExtractor: maps terms to file extensions (e.g. "pdf", "document", "images") - KeywordExtractor: handles structured patterns and unconsumed text as keywords 3. Added IntentParser to coordinate extractors and produce ParsedIntent with: - Time constraints - File type filters - Search keywords - Consumed text spans 4. Implemented SemanticQueryBuilder to convert ParsedIntent into concrete: - SearchQuery objects (filename, content, OCR) - SearchOptions (time ranges, pinyin matching etc) 5. Added SemanticSearcher as main entry point with async/sync search APIs: - Parses natural language queries - Parallel searches across filename/content/OCR indexes - Deduplicates results - Timeout handling 6. Includes over 200 test cases covering Chinese NLP parsing for: - Time expressions - File type synonyms - Keyword patterns - Combined scenarios - Error cases 7. 
Adds rule files for Chinese language support: - Time expressions (relative/absolute) - File type mappings (precise/general) - Keyword patterns (contains/named/content) - Noise words (actions/polite/suffix) Log: Added semantic search with natural language support for Chinese Influence: 1. Test "find yesterday's pdf documents" with various time expressions 2. Verify file type mappings work for precise (pdf) and general (document) terms 3. Check keyword extraction from patterns and unconsumed text 4. Test combination searches with time+type+keywords 5. Validate results deduplication across search paths 6. Confirm timeout and cancellation works properly feat: 新增自然语言语义搜索功能 本次提交为文件管理器引入了全面的基于自然语言处理(NLP)的语义搜索系统。主 要变更包括: 1. 添加SemanticRuleEngine从JSON配置文件加载和处理基于正则表达式的规则。 规则按组(时间、文件类型、关键词、噪音词)组织,带有优先级和元数据。 2. 实现DimensionExtractor基类和具体提取器: - TimeExtractor: 解析相对/预设时间("今天"、"上周")和具体日期 - FileTypeExtractor: 将术语映射到文件扩展名(如"pdf"、"文档"、"图片") - KeywordExtractor: 处理结构化模式和未消费文本作为关键词 3. 添加IntentParser协调提取器并生成ParsedIntent,包含: - 时间约束条件 - 文件类型过滤器 - 搜索关键词 - 已消费文本范围 4. 实现SemanticQueryBuilder将ParsedIntent转换为具体: - SearchQuery对象(文件名、内容、OCR) - SearchOptions(时间范围、拼音匹配等) 5. 添加SemanticSearcher作为主要入口点,提供异步/同步搜索API: - 解析自然语言查询 - 并行搜索文件名/内容/OCR索引 - 结果去重 - 超时处理 6. 包含200多个测试用例,覆盖中文NLP解析: - 时间表达式 - 文件类型同义词 - 关键词模式 - 组合场景 - 错误情况 7. 添加中文语言规则文件: - 时间表达式(相对/绝对) - 文件类型映射(精确/通用) - 关键词模式(包含/名为/内容) - 噪音词(动作/礼貌/后缀) Log: 新增支持中文的自然语言语义搜索功能 Influence: 1. 测试"查找昨天的pdf文档"等不同时间表达式 2. 验证文件类型映射对精确(pdf)和通用(文档)术语有效 3. 检查从模式和未消费文本中提取关键词 4. 测试时间+类型+关键词的组合搜索 5. 确认跨搜索路径的结果去重 6. 
验证超时和取消功能正常工作 --- autotests/dfm-search-tests/CMakeLists.txt | 6 + autotests/dfm-search-tests/main.cpp | 30 + .../dfm-search-tests/tst_chinese_nlp.cpp | 1035 +++++++++++++++++ .../dfm-search-tests/tst_semantic_search.cpp | 664 +++++++++++ debian/libdfm-search.install | 3 +- debian/libdfm6-search.install | 3 +- .../dfm-search/dimensionextractor.h | 32 + .../dfm-search/dfm-search/semantic_types.h | 86 ++ .../dfm-search/dfm-search/semanticsearcher.h | 131 +++ .../dfm-search-client/cli_options.cpp | 56 +- .../dfm-search-client/cli_options.h | 8 +- src/dfm-search/dfm-search-client/main.cpp | 34 +- .../dfm-search-lib/dfm-search.cmake | 11 + .../semantic/extractors/filetypeextractor.cpp | 63 + .../semantic/extractors/filetypeextractor.h | 29 + .../semantic/extractors/keywordextractor.cpp | 199 ++++ .../semantic/extractors/keywordextractor.h | 34 + .../semantic/extractors/timeextractor.cpp | 214 ++++ .../semantic/extractors/timeextractor.h | 52 + .../dfm-search-lib/semantic/intentparser.cpp | 52 + .../dfm-search-lib/semantic/intentparser.h | 59 + .../semantic/ruleconfigloader.cpp | 168 +++ .../semantic/ruleconfigloader.h | 101 ++ .../semantic/rules/zh_CN/filetype_rules.json | 154 +++ .../semantic/rules/zh_CN/keyword_rules.json | 51 + .../semantic/rules/zh_CN/noise_rules.json | 44 + .../semantic/rules/zh_CN/time_rules.json | 236 ++++ .../semantic/semanticquerybuilder.cpp | 187 +++ .../semantic/semanticquerybuilder.h | 52 + .../semantic/semanticruleengine.cpp | 285 +++++ .../semantic/semanticruleengine.h | 131 +++ .../semantic/semanticsearcher.cpp | 296 +++++ .../semantic/semanticsearcher_p.h | 58 + 33 files changed, 4550 insertions(+), 14 deletions(-) create mode 100644 autotests/dfm-search-tests/tst_chinese_nlp.cpp create mode 100644 autotests/dfm-search-tests/tst_semantic_search.cpp create mode 100644 include/dfm-search/dfm-search/dimensionextractor.h create mode 100644 include/dfm-search/dfm-search/semantic_types.h create mode 100644 
include/dfm-search/dfm-search/semanticsearcher.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/intentparser.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/intentparser.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json create mode 100644 src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h diff --git a/autotests/dfm-search-tests/CMakeLists.txt b/autotests/dfm-search-tests/CMakeLists.txt index 50d22144..3aa002e0 100644 --- a/autotests/dfm-search-tests/CMakeLists.txt +++ b/autotests/dfm-search-tests/CMakeLists.txt @@ -30,5 +30,11 @@ 
target_include_directories(dfm-search-test ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-lib ) +# Pass source directory for locating rule files at runtime +target_compile_definitions(dfm-search-test + PRIVATE + TEST_SOURCE_DIR="${CMAKE_SOURCE_DIR}" +) + # Register the test with CTest add_test(NAME dfm-search-test COMMAND dfm-search-test) diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index 59a2f42e..96b6b4ca 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -9,6 +9,12 @@ extern QObject *create_tst_DfmSearch(); extern QObject *create_tst_SearchUtils(); extern QObject *create_tst_TimeRangeFilter(); extern QObject *create_tst_TextSearchAPI(); +extern QObject *create_tst_RuleEngine(); +extern QObject *create_tst_TimeExtraction(); +extern QObject *create_tst_FileTypeExtraction(); +extern QObject *create_tst_KeywordExtraction(); +extern QObject *create_tst_ParsedIntent(); +extern QObject *create_tst_ChineseNLP(); int main(int argc, char *argv[]) { @@ -31,5 +37,29 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj4, argc, argv); delete testObj4; + QObject *testObj5 = create_tst_RuleEngine(); + result |= QTest::qExec(testObj5, argc, argv); + delete testObj5; + + QObject *testObj6 = create_tst_TimeExtraction(); + result |= QTest::qExec(testObj6, argc, argv); + delete testObj6; + + QObject *testObj7 = create_tst_FileTypeExtraction(); + result |= QTest::qExec(testObj7, argc, argv); + delete testObj7; + + QObject *testObj8 = create_tst_KeywordExtraction(); + result |= QTest::qExec(testObj8, argc, argv); + delete testObj8; + + QObject *testObj9 = create_tst_ParsedIntent(); + result |= QTest::qExec(testObj9, argc, argv); + delete testObj9; + + QObject *testObj10 = create_tst_ChineseNLP(); + result |= QTest::qExec(testObj10, argc, argv); + delete testObj10; + return result; } diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp 
b/autotests/dfm-search-tests/tst_chinese_nlp.cpp new file mode 100644 index 00000000..a3b79aa5 --- /dev/null +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -0,0 +1,1035 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include + +#include "semantic/intentparser.h" +#include "semantic/semanticruleengine.h" + +using namespace DFMSEARCH; + +static QString rulesDir() +{ + return QStringLiteral(TEST_SOURCE_DIR) + QStringLiteral( + "/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN"); +} + +// Helper: compare two QStringList as sets (order-independent) +static bool setEquals(const QStringList &a, const QStringList &b) +{ + return QSet(a.begin(), a.end()) == QSet(b.begin(), b.end()); +} + +class tst_ChineseNLP : public QObject +{ + Q_OBJECT + +private: + SemanticRuleEngine *m_engine = nullptr; + IntentParser *m_parser = nullptr; + +private Q_SLOTS: + void initTestCase(); + void cleanupTestCase(); + void init(); + + // Time preset tests + void timePreset_today(); + void timePreset_today_alt(); + void timePreset_yesterday(); + void timePreset_yesterday_variants(); + void timePreset_dayBeforeYesterday(); + void timePreset_thisWeek_variants(); + void timePreset_lastWeek_variants(); + void timePreset_thisMonth_variants(); + void timePreset_lastMonth_variants(); + void timePreset_thisYear_variants(); + void timePreset_lastYear_variants(); + + // Time custom tests + void timeCustom_year(); + void timeCustom_year_twoDigit(); + void timeCustom_month(); + void timeCustom_yearMonth(); + void timeCustom_yearMonth_separators(); + void timeCustom_date(); + void timeCustom_dateSpoken(); + void timeCustom_fullDate(); + void timeCustom_fullDate_separators(); + void timeCustom_yesterday_variants_all(); + void timeCustom_lastYear_extra(); + + // File type tests + void fileType_precise_pdf(); + void fileType_precise_word(); + void fileType_precise_excel(); + void 
fileType_precise_ppt(); + void fileType_category_image_variants(); + void fileType_category_video_variants(); + void fileType_category_audio_variants(); + void fileType_category_archive(); + void fileType_category_application(); + void fileType_category_designSource(); + void fileType_general_document(); + void fileType_general_spreadsheet(); + void fileType_general_presentation(); + + // Filetype all-synonyms tests (from requirements) + void fileType_document_general_allSynonyms(); + void fileType_spreadsheet_general_allSynonyms(); + void filetype_presentation_general_allSynonyms(); + void fileType_image_allSynonyms(); + void fileType_video_allSynonyms(); + void fileType_audio_allSynonyms(); + void fileType_archive_allSynonyms(); + void fileType_application_allSynonyms(); + void fileType_design_source_allSynonyms(); + + // Combined time+type tests + void combined_fullDateAndType(); + void combined_monthAndType(); + void combined_yearAndType(); + + // Keyword tests + void keyword_contains_single(); + void keyword_contains_multi(); + void keyword_named(); + void keyword_contentHas(); + void keyword_contentHas_multi(); + + // Noise + unconsumed text tests + void noise_action_words(); + void noise_polite_words(); + void noise_suffix_words(); + + // End-to-end combined tests + void combined_timeAndFiletype(); + void combined_timeAndFiletype_multi(); + void combined_timeAndFiletype_all(); + void combined_timeAndKeyword(); + void combined_filetypeAndKeyword(); + void combined_timeAndFiletypeAndKeyword(); + void combined_noiseStripping(); + void combined_fullSentence(); + void combined_noTime(); + void combined_onlyKeyword(); + void combined_generalSuppressed(); + void combined_contentHasAndType(); +}; + +void tst_ChineseNLP::initTestCase() +{ + // Create a fallback QCoreApplication if none exists; Qt keeps argc/argv by reference for the app's lifetime and needs argc >= 1, hence the static storage + if (!QCoreApplication::instance()) { + static int argc = 1; static char arg0[] = "dfm-search-test"; static char *argv[] = { arg0, nullptr }; + new QCoreApplication(argc, argv); + } + + m_engine = new SemanticRuleEngine(this); + + // Load all 4 rule files + 
const QString dir = rulesDir(); + QVERIFY2(QDir(dir).exists(), qPrintable(QStringLiteral("Rules dir not found: ") + dir)); + + const QStringList files = { "noise_rules.json", "time_rules.json", + "filetype_rules.json", "keyword_rules.json" }; + for (const QString &f : files) { + const QString path = dir + QLatin1Char('/') + f; + bool ok = m_engine->loadRuleFile(path); + QVERIFY2(ok, qPrintable(QStringLiteral("Failed to load: ") + path)); + } + + // Verify all groups loaded + QVERIFY(m_engine->hasGroup("time")); + QVERIFY(m_engine->hasGroup("filetype")); + QVERIFY(m_engine->hasGroup("keyword")); + QVERIFY(m_engine->hasGroup("noise")); + + const QStringList groups = m_engine->groupNames(); + QCOMPARE(groups.size(), 4); + + m_parser = new IntentParser(m_engine); + + // Verify default extractors are initialized + QStringList names = m_parser->extractorNames(); + QCOMPARE(names.size(), 3); + QVERIFY(names.contains("time")); + QVERIFY(names.contains("filetype")); + QVERIFY(names.contains("keyword")); +} + +void tst_ChineseNLP::cleanupTestCase() +{ + delete m_parser; + m_parser = nullptr; +} + +void tst_ChineseNLP::init() +{ + // Each test gets a fresh parse — no shared state between tests +} + +// ===== Time Preset Tests ===== + +void tst_ChineseNLP::timePreset_today() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); +} + +void tst_ChineseNLP::timePreset_today_alt() +{ + // 今日 and 今日份 + ParsedIntent intent1; + m_parser->parse(QStringLiteral("今日的文档"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent1.timeConstraint.preset, TimePreset::Today); + + ParsedIntent intent2; + m_parser->parse(QStringLiteral("今日份图片"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent2.timeConstraint.preset, TimePreset::Today); +} + +void 
tst_ChineseNLP::timePreset_yesterday() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); +} + +void tst_ChineseNLP::timePreset_yesterday_variants() +{ + const QStringList inputs = { QStringLiteral("昨日"), QStringLiteral("昨晚"), + QStringLiteral("昨天上午"), QStringLiteral("昨天下午"), + QStringLiteral("昨天晚上") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + } +} + +void tst_ChineseNLP::timePreset_dayBeforeYesterday() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("前天的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::DayBeforeYesterday); +} + +void tst_ChineseNLP::timePreset_thisWeek_variants() +{ + const QStringList inputs = { QStringLiteral("本周"), QStringLiteral("这周"), + QStringLiteral("这个星期"), QStringLiteral("这一个星期") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::ThisWeek); + } +} + +void tst_ChineseNLP::timePreset_lastWeek_variants() +{ + const QStringList inputs = { QStringLiteral("上周"), QStringLiteral("上个星期"), + QStringLiteral("上星期"), QStringLiteral("上一个星期") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastWeek); + } +} + +void tst_ChineseNLP::timePreset_thisMonth_variants() +{ + const QStringList inputs = { QStringLiteral("本月"), QStringLiteral("这个月"), + QStringLiteral("当月") }; + for (const 
QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::ThisMonth); + } +} + +void tst_ChineseNLP::timePreset_lastMonth_variants() +{ + const QStringList inputs = { QStringLiteral("上个月"), QStringLiteral("上月") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastMonth); + } +} + +void tst_ChineseNLP::timePreset_thisYear_variants() +{ + const QStringList inputs = { QStringLiteral("今年"), QStringLiteral("本年"), + QStringLiteral("这年") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::ThisYear); + } +} + +void tst_ChineseNLP::timePreset_lastYear_variants() +{ + const QStringList inputs = { QStringLiteral("去年"), QStringLiteral("上一年") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastYear); + } +} + +// ===== Time Custom Tests ===== + +void tst_ChineseNLP::timeCustom_year() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 1); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 1); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + QCOMPARE(intent.timeConstraint.customEnd.date().day(), 31); +} + +void 
tst_ChineseNLP::timeCustom_year_twoDigit() +{ + // Two-digit year: 25 -> 2025 + ParsedIntent intent; + m_parser->parse(QStringLiteral("25年的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); +} + +// ===== File Type Tests ===== + +void tst_ChineseNLP::fileType_precise_pdf() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("pdf"), intent); + QVERIFY(intent.fileExtensions.contains("pdf")); + QCOMPARE(intent.fileExtensions.size(), 1); +} + +void tst_ChineseNLP::fileType_precise_word() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("word"), intent); + QVERIFY(setEquals(intent.fileExtensions, QStringList { "doc", "docx" })); +} + +void tst_ChineseNLP::fileType_precise_excel() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("excel"), intent); + QVERIFY(setEquals(intent.fileExtensions, QStringList { "xls", "xlsx" })); +} + +void tst_ChineseNLP::fileType_precise_ppt() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("ppt"), intent); + QVERIFY(setEquals(intent.fileExtensions, QStringList { "ppt", "pptx" })); +} + +void tst_ChineseNLP::fileType_category_image_variants() +{ + const QStringList inputs = { QStringLiteral("图片"), QStringLiteral("照片"), + QStringLiteral("截图"), QStringLiteral("壁纸"), + QStringLiteral("海报"), QStringLiteral("相片"), + QStringLiteral("表情包"), QStringLiteral("图") }; + const QStringList expectedExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_category_video_variants() +{ + const QStringList inputs = { QStringLiteral("视频"), QStringLiteral("录像"), + QStringLiteral("电影"), QStringLiteral("动画"), + 
QStringLiteral("短片"), QStringLiteral("片子") }; + const QStringList expectedExts = { "mp4", "avi", "mkv", "mov", "flv", "wmv", "webm" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_category_audio_variants() +{ + const QStringList inputs = { QStringLiteral("音频"), QStringLiteral("音乐"), + QStringLiteral("录音"), QStringLiteral("歌"), + QStringLiteral("语音") }; + const QStringList expectedExts = { "mp3", "wav", "flac", "aac", "ogg", "m4a" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_category_archive() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("压缩包"), intent); + QVERIFY(intent.fileExtensions.contains("zip")); + QVERIFY(intent.fileExtensions.contains("tar.gz")); + QVERIFY(intent.fileExtensions.contains("rar")); + QVERIFY(intent.fileExtensions.contains("7z")); +} + +void tst_ChineseNLP::fileType_category_application() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("安装包"), intent); + QVERIFY(intent.fileExtensions.contains("deb")); + QVERIFY(intent.fileExtensions.contains("sh")); +} + +void tst_ChineseNLP::fileType_category_designSource() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("源文件"), intent); + QVERIFY(intent.fileExtensions.contains("psd")); + QVERIFY(intent.fileExtensions.contains("ai")); +} + +void tst_ChineseNLP::fileType_general_document() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("文档"), intent); + const QStringList expectedExts = { "doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt" }; + QVERIFY(setEquals(intent.fileExtensions, expectedExts)); +} + +void 
tst_ChineseNLP::fileType_general_spreadsheet() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("表格"), intent); + QVERIFY(intent.fileExtensions.contains("xls")); + QVERIFY(intent.fileExtensions.contains("xlsx")); + QVERIFY(intent.fileExtensions.contains("csv")); +} + +void tst_ChineseNLP::fileType_general_presentation() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("幻灯片"), intent); + QVERIFY(intent.fileExtensions.contains("ppt")); + QVERIFY(intent.fileExtensions.contains("pptx")); +} + +// ===== Keyword Tests ===== + +void tst_ChineseNLP::keyword_contains_single() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("包含会议记录的文档"), intent); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("会议记录")); +} + +void tst_ChineseNLP::keyword_contains_multi() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("包含预算和收入的报告"), intent); + QCOMPARE(intent.keywords.size(), 2); + QVERIFY(intent.keywords.contains("预算")); + QVERIFY(intent.keywords.contains("收入")); +} + +void tst_ChineseNLP::keyword_named() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("名为方案A的文档"), intent); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("方案A")); +} + +void tst_ChineseNLP::keyword_contentHas() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("内容包含数据分析的报告"), intent); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("数据分析")); +} + +void tst_ChineseNLP::keyword_contentHas_multi() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("内容含有产品规划和市场调研的报告"), intent); + QCOMPARE(intent.keywords.size(), 2); + QVERIFY(intent.keywords.contains("产品规划")); + QVERIFY(intent.keywords.contains("市场调研")); +} + +// ===== Noise + Unconsumed Text Tests ===== + +void tst_ChineseNLP::noise_action_words() +{ + // "搜索" is noise; "上周" is time; "图片" is filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("搜索上周的图片"), intent); + 
QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastWeek); + // Filetype should be matched + QVERIFY(!intent.fileExtensions.isEmpty()); + // No keywords since all text is consumed + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::noise_polite_words() +{ + // "请帮我找" consumed as noise; "今天" time; "文档" filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("请帮我找今天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(!intent.fileExtensions.isEmpty()); + // All text consumed by noise + time + filetype + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::noise_suffix_words() +{ + // "昨天上午" time; "的照片" noise_suffix + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天上午的照片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QVERIFY(!intent.fileExtensions.isEmpty()); + QVERIFY(intent.keywords.isEmpty()); +} + +// ===== End-to-End Combined Tests ===== + +void tst_ChineseNLP::combined_timeAndFiletype() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + const QStringList imageExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" }; + QVERIFY(setEquals(intent.fileExtensions, imageExts)); + // "的" is consumed by noise_suffix "的图片" + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_timeAndFiletype_multi() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + // Should contain both image and video extensions + 
QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("png")); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.fileExtensions.contains("mkv")); + QVERIFY(intent.fileExtensions.contains("avi")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_timeAndFiletype_all() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的图片和视频和音频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.fileExtensions.contains("mp3")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_timeAndKeyword() +{ + // "今天" time, "包含会议记录的" keyword pattern, "文档" filetype + // But since keyword pattern matches, filetype_document_general is skipped + // because keyword_extractor returns early. The filetype extractor runs + // before keyword extractor and matches "文档". 
+ ParsedIntent intent; + m_parser->parse(QStringLiteral("今天包含会议记录的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(intent.keywords.contains("会议记录")); + // "文档" matches filetype_document_general + QVERIFY(!intent.fileExtensions.isEmpty()); +} + +void tst_ChineseNLP::combined_filetypeAndKeyword() +{ + // "名为方案A的" → keyword "方案A"; "pdf" → filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("名为方案A的pdf"), intent); + QVERIFY(intent.fileExtensions.contains("pdf")); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("方案A")); +} + +void tst_ChineseNLP::combined_timeAndFiletypeAndKeyword() +{ + // "昨天" time, "视频" filetype (priority 150, non-general), + // "包含报告的" keyword → "报告" + // Note: "报告" also matches filetype_document_general but it's general + // and gets skipped since video exts are already in seenExtensions + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天的视频和包含报告的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + // Video extensions + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.fileExtensions.contains("avi")); + // Keyword extracted from structured pattern + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("报告")); +} + +void tst_ChineseNLP::combined_noiseStripping() +{ + // "帮我找" noise_action, "今天" time, "会议" unconsumed → keyword, "文档" filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("帮我找今天的会议文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(!intent.fileExtensions.isEmpty()); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("会议")); +} + +void tst_ChineseNLP::combined_fullSentence() +{ + // "请搜索上周的图片和视频" → 
noise(请,搜索) + time(上周) + filetype(图片,视频) + ParsedIntent intent; + m_parser->parse(QStringLiteral("请搜索上周的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastWeek); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_noTime() +{ + // No time, keyword from "包含数据", filetype from "表格" + ParsedIntent intent; + m_parser->parse(QStringLiteral("包含数据的表格"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::None); + QVERIFY(!intent.fileExtensions.isEmpty()); + QVERIFY(intent.fileExtensions.contains("xls")); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("数据")); +} + +void tst_ChineseNLP::combined_onlyKeyword() +{ + // No time, no filetype, only unconsumed text as keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("会议记录"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::None); + QVERIFY(intent.fileExtensions.isEmpty()); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("会议记录")); +} + +void tst_ChineseNLP::combined_generalSuppressed() +{ + // "pdf" precise (priority 200) wins; "文档" general (priority 100) suppressed + ParsedIntent intent; + m_parser->parse(QStringLiteral("pdf文档"), intent); + QCOMPARE(intent.fileExtensions.size(), 1); + QCOMPARE(intent.fileExtensions.first(), QString("pdf")); +} + +void tst_ChineseNLP::combined_contentHasAndType() +{ + // "内容包含测试的报告" → keyword "测试", filetype "报告" (document general) + ParsedIntent intent; + m_parser->parse(QStringLiteral("内容包含测试的报告"), intent); + QVERIFY(intent.keywords.contains("测试")); + QVERIFY(!intent.fileExtensions.isEmpty()); + // "报告" is in filetype_document_general pattern + QVERIFY(intent.fileExtensions.contains("doc")); + QVERIFY(intent.fileExtensions.contains("pdf")); +} + +// ===== New Time 
Custom Tests ===== + +void tst_ChineseNLP::timeCustom_month() +{ + // "12月" → this month 12 + ParsedIntent intent; + m_parser->parse(QStringLiteral("12月的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 1); + // End should be last day of December + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + + // "5月份" — same month, different syntax + ParsedIntent intent2; + m_parser->parse(QStringLiteral("5月份的图片"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent2.timeConstraint.customStart.date().month(), 5); +} + +void tst_ChineseNLP::timeCustom_yearMonth() +{ + // "2025年12月" → year=2025, month=12 + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年12月的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 1); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); +} + +void tst_ChineseNLP::timeCustom_yearMonth_separators() +{ + // "2025-12" — dash separator + ParsedIntent intent1; + m_parser->parse(QStringLiteral("2025-12的图片"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent1.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent1.timeConstraint.customStart.date().month(), 12); + + // "2025/12" — slash separator + ParsedIntent intent2; + m_parser->parse(QStringLiteral("2025/12的视频"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent2.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent2.timeConstraint.customStart.date().month(), 12); + + 
// "25.12" — dot separator, 2-digit year + ParsedIntent intent3; + m_parser->parse(QStringLiteral("25.12的文件"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent3.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent3.timeConstraint.customStart.date().month(), 12); +} + +void tst_ChineseNLP::timeCustom_date() +{ + // "12月5日" → this year, Dec 5 + ParsedIntent intent; + m_parser->parse(QStringLiteral("12月5日的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 5); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + QCOMPARE(intent.timeConstraint.customEnd.date().day(), 5); + QCOMPARE(intent.timeConstraint.customStart.date().year(), QDate::currentDate().year()); +} + +void tst_ChineseNLP::timeCustom_dateSpoken() +{ + // "3月8号" — spoken form with 号 + ParsedIntent intent; + m_parser->parse(QStringLiteral("3月8号的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 3); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 8); +} + +void tst_ChineseNLP::timeCustom_fullDate() +{ + // "2025年12月30日" — the specific example from requirements + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年12月30日的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 30); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + QCOMPARE(intent.timeConstraint.customEnd.date().day(), 30); + // Verify time boundaries + QCOMPARE(intent.timeConstraint.customStart.time().hour(), 0); 
+ QCOMPARE(intent.timeConstraint.customEnd.time().hour(), 23); +} + +void tst_ChineseNLP::timeCustom_fullDate_separators() +{ + // "2025-12-05" — dash format + ParsedIntent intent1; + m_parser->parse(QStringLiteral("2025-12-05的文档"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent1.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent1.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent1.timeConstraint.customStart.date().day(), 5); + + // "2025/12/5" — slash format (no leading zero) + ParsedIntent intent2; + m_parser->parse(QStringLiteral("2025/12/5的文件"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent2.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent2.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent2.timeConstraint.customStart.date().day(), 5); + + // "2025.12.5" — dot format + ParsedIntent intent3; + m_parser->parse(QStringLiteral("2025.12.5的图片"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent3.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent3.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent3.timeConstraint.customStart.date().day(), 5); +} + +void tst_ChineseNLP::timeCustom_yesterday_variants_all() +{ + // "昨天下午" and "昨天晚上" — these are multi-char variants + ParsedIntent intent1; + m_parser->parse(QStringLiteral("昨天下午的图片"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent1.timeConstraint.preset, TimePreset::Yesterday); + + ParsedIntent intent2; + m_parser->parse(QStringLiteral("昨天晚上的视频"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent2.timeConstraint.preset, TimePreset::Yesterday); +} + +void tst_ChineseNLP::timeCustom_lastYear_extra() +{ + // "去年一整年" — not in current rules, but in requirements + // Current rules only have 
"去年|上一年". Test that "去年" works. + ParsedIntent intent; + m_parser->parse(QStringLiteral("去年的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastYear); +} + +// ===== Filetype All-Synonyms Tests (from requirements) ===== + +void tst_ChineseNLP::fileType_document_general_allSynonyms() +{ + // Requirements 2.3.2.2.2: 文档, 文件, 报告, 文章, 方案, 文本, 资料, 笔记, 稿件 + const QStringList inputs = { + QStringLiteral("文档"), QStringLiteral("文件"), QStringLiteral("报告"), + QStringLiteral("文章"), QStringLiteral("方案"), QStringLiteral("文本"), + QStringLiteral("资料"), QStringLiteral("笔记"), QStringLiteral("稿件") + }; + const QStringList expectedExts = {"doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt"}; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input + + QStringLiteral(" got: ") + intent.fileExtensions.join(","))); + } +} + +void tst_ChineseNLP::fileType_spreadsheet_general_allSynonyms() +{ + // Requirements: 表格, 统计表, 报表, 名单, 数据表, 数据, 明细 + // NOTE: "数据" is excluded from rules due to high false-positive risk + const QStringList inputs = { + QStringLiteral("表格"), QStringLiteral("统计表"), QStringLiteral("报表"), + QStringLiteral("名单"), QStringLiteral("数据表"), QStringLiteral("明细") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("xls"), + qPrintable(QStringLiteral("Missing xls for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("xlsx"), + qPrintable(QStringLiteral("Missing xlsx for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("csv"), + qPrintable(QStringLiteral("Missing csv for: ") + input)); + } +} + +void tst_ChineseNLP::filetype_presentation_general_allSynonyms() +{ + // Requirements: 幻灯片, 演示文稿, 汇报, 课件, 宣讲 + const QStringList inputs = { 
+ QStringLiteral("幻灯片"), QStringLiteral("演示文稿"), QStringLiteral("汇报"), + QStringLiteral("课件"), QStringLiteral("宣讲") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("ppt"), + qPrintable(QStringLiteral("Missing ppt for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("pptx"), + qPrintable(QStringLiteral("Missing pptx for: ") + input)); + } +} + +void tst_ChineseNLP::fileType_image_allSynonyms() +{ + // Requirements: 图片, 照片, 截图, 图, 壁纸, 海报, 相片, 表情包 + const QStringList inputs = { + QStringLiteral("图片"), QStringLiteral("照片"), QStringLiteral("截图"), + QStringLiteral("图"), QStringLiteral("壁纸"), QStringLiteral("海报"), + QStringLiteral("相片"), QStringLiteral("表情包") + }; + const QStringList expectedExts = {"jpg", "jpeg", "png", "gif", "bmp", "webp", "svg"}; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_video_allSynonyms() +{ + // Requirements: 视频, 录像, 电影, 动画, 短片, 片子 + const QStringList inputs = { + QStringLiteral("视频"), QStringLiteral("录像"), QStringLiteral("电影"), + QStringLiteral("动画"), QStringLiteral("短片"), QStringLiteral("片子") + }; + const QStringList expectedExts = {"mp4", "avi", "mkv", "mov", "flv", "wmv", "webm"}; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_audio_allSynonyms() +{ + // Requirements: 音频, 音乐, 录音, 歌, 语音 + const QStringList inputs = { + QStringLiteral("音频"), QStringLiteral("音乐"), QStringLiteral("录音"), + QStringLiteral("歌"), QStringLiteral("语音") + }; + const QStringList expectedExts = {"mp3", "wav", "flac", "aac", "ogg", "m4a"}; + for (const 
QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_archive_allSynonyms() +{ + // Requirements: 压缩包, 归档, 源码包, 打包文件, zip, rar + const QStringList inputs = { + QStringLiteral("压缩包"), QStringLiteral("归档"), QStringLiteral("源码包"), + QStringLiteral("打包文件"), QStringLiteral("zip"), QStringLiteral("rar") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("zip"), + qPrintable(QStringLiteral("Missing zip for: ") + input)); + } +} + +void tst_ChineseNLP::fileType_application_allSynonyms() +{ + // Requirements: 安装包, 软件, 应用, 脚本, 程序, 包 + // NOTE: "包" excluded from rules to avoid false positives with "表情包", "压缩包" + const QStringList inputs = { + QStringLiteral("安装包"), QStringLiteral("软件"), QStringLiteral("应用"), + QStringLiteral("脚本"), QStringLiteral("程序") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("deb"), + qPrintable(QStringLiteral("Missing deb for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("sh"), + qPrintable(QStringLiteral("Missing sh for: ") + input)); + } +} + +void tst_ChineseNLP::fileType_design_source_allSynonyms() +{ + // Requirements: 源文件, 设计稿, psd, 矢量图, 工程文件 + const QStringList inputs = { + QStringLiteral("源文件"), QStringLiteral("设计稿"), QStringLiteral("矢量图"), + QStringLiteral("工程文件"), QStringLiteral("psd"), QStringLiteral("fig"), + QStringLiteral("sketch") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("psd"), + qPrintable(QStringLiteral("Missing psd for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("ai"), + qPrintable(QStringLiteral("Missing ai for: ") + input)); + } +} + 
+// ===== Combined Time+Type Tests ===== + +void tst_ChineseNLP::combined_fullDateAndType() +{ + // Requirements example: "2025年12月30日的文档" + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年12月30日的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 30); + // "文档" matches filetype_document_general + QVERIFY(intent.fileExtensions.contains("doc")); + QVERIFY(intent.fileExtensions.contains("pdf")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_monthAndType() +{ + // "12月的图片" — month + image + ParsedIntent intent; + m_parser->parse(QStringLiteral("12月的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("png")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_yearAndType() +{ + // "2025年的视频" — year + video + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年的视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.fileExtensions.contains("avi")); +} + +QObject *create_tst_ChineseNLP() +{ + return new tst_ChineseNLP(); +} + +#include "tst_chinese_nlp.moc" diff --git a/autotests/dfm-search-tests/tst_semantic_search.cpp b/autotests/dfm-search-tests/tst_semantic_search.cpp new file mode 100644 index 00000000..0aa6ae3c --- /dev/null +++ b/autotests/dfm-search-tests/tst_semantic_search.cpp @@ -0,0 +1,664 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include + +#include "semantic/semanticruleengine.h" +#include "semantic/intentparser.h" + +using namespace DFMSEARCH; + +static bool buildGroupFromJson(const QByteArray &json, RuleGroup &outGroup) +{ + QJsonDocument doc = QJsonDocument::fromJson(json); + if (!doc.isObject()) { + return false; + } + QJsonObject root = doc.object(); + QJsonArray groups = root.value("groups").toArray(); + if (groups.isEmpty()) { + return false; + } + return SemanticRuleEngine::parseRuleGroupStatic(groups.at(0).toObject(), outGroup); +} + +// Helper: build a simple rule group JSON string +static QByteArray makeRuleJson(const QString &groupName, const QString &ruleId, + const QString &pattern, int priority, + const QVariantMap &metadata = {}) +{ + QJsonObject ruleObj; + ruleObj["id"] = ruleId; + ruleObj["pattern"] = pattern; + ruleObj["enabled"] = true; + ruleObj["priority"] = priority; + if (!metadata.isEmpty()) { + ruleObj["metadata"] = QJsonObject::fromVariantMap(metadata); + } + + QJsonObject ruleGroupObj; + ruleGroupObj["name"] = groupName; + ruleGroupObj["version"] = "1.0.0"; + ruleGroupObj["rules"] = QJsonArray({ruleObj}); + + QJsonObject root; + root["groups"] = QJsonArray({ruleGroupObj}); + + return QJsonDocument(root).toJson(QJsonDocument::Compact); +} + +// ===== tst_RuleEngine ===== + +class tst_RuleEngine : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void parseValidGroup(); + void parseEmptyGroup(); + void parsePriorityOrdering(); + void matchReturnsHighestPriority(); + void matchAllReturnsAll(); + void ruleMetadataAccess(); + void hasGroupCheck(); +}; + +void tst_RuleEngine::parseValidGroup() +{ + QByteArray json = makeRuleJson("test", "r1", "hello", 100); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.name, QString("test")); + QCOMPARE(group.rules.size(), 1); + QCOMPARE(group.rules[0].id, QString("r1")); + 
QVERIFY(group.rules[0].regex.isValid()); +} + +void tst_RuleEngine::parseEmptyGroup() +{ + QByteArray json = R"=====({"groups": [{"name": "empty", "rules": []}]})====="; + + RuleGroup group; + QVERIFY(!buildGroupFromJson(json, group)); +} + +void tst_RuleEngine::parsePriorityOrdering() +{ + QJsonObject r1, r2, r3; + r1["id"] = "low"; r1["pattern"] = "test"; r1["priority"] = 10; + r2["id"] = "high"; r2["pattern"] = "test"; r2["priority"] = 200; + r3["id"] = "mid"; r3["pattern"] = "test"; r3["priority"] = 100; + + QJsonObject ruleGroup; + ruleGroup["name"] = "prio"; + ruleGroup["version"] = "1.0.0"; + ruleGroup["rules"] = QJsonArray({r1, r2, r3}); + + QJsonObject root; + root["groups"] = QJsonArray({ruleGroup}); + + RuleGroup group; + QVERIFY(SemanticRuleEngine::parseRuleGroupStatic(ruleGroup, group)); + QCOMPARE(group.rules.size(), 3); + + QStringList ids; + for (const Rule &r : group.rules) { + ids.append(r.id); + } + QVERIFY(ids.contains("low")); + QVERIFY(ids.contains("mid")); + QVERIFY(ids.contains("high")); +} + +void tst_RuleEngine::matchReturnsHighestPriority() +{ + QByteArray json = makeRuleJson("test_match", "r1", "abc", 200, + {{"level", "high"}}); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("abc").hasMatch()); +} + +void tst_RuleEngine::matchAllReturnsAll() +{ + QJsonObject r1, r2, r3; + r1["id"] = "r1"; r1["pattern"] = "cat"; r1["priority"] = 100; + r2["id"] = "r2"; r2["pattern"] = "dog"; r2["priority"] = 100; + r3["id"] = "r3"; r3["pattern"] = "bird"; r3["priority"] = 50; + + QJsonObject ruleGroup; + ruleGroup["name"] = "test_all"; + ruleGroup["version"] = "1.0.0"; + ruleGroup["rules"] = QJsonArray({r1, r2, r3}); + + QJsonObject root; + root["groups"] = QJsonArray({ruleGroup}); + + RuleGroup group; + QVERIFY(SemanticRuleEngine::parseRuleGroupStatic(ruleGroup, group)); + QCOMPARE(group.rules.size(), 3); +} + +void tst_RuleEngine::ruleMetadataAccess() +{ + QVariantMap meta; + meta["type"] = 
"preset"; + meta["preset"] = "today"; + QByteArray json = makeRuleJson("test_meta", "m1", "test", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("type").toString(), QString("preset")); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("today")); +} + +void tst_RuleEngine::hasGroupCheck() +{ + SemanticRuleEngine engine; + QVERIFY(!engine.hasGroup("time")); + QVERIFY(!engine.hasGroup("filetype")); + QCOMPARE(engine.groupNames().size(), 0); +} + +// ===== tst_TimeExtraction ===== + +class tst_TimeExtraction : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void presetToday(); + void presetYesterday(); + void presetThisWeek(); + void presetThisMonth(); + void presetThisYear(); + void presetLastYear(); + void customYear(); + void customYearMonth(); + void customFullDate(); + void noMatch(); +}; + +void tst_TimeExtraction::presetToday() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "today"; + QByteArray json = makeRuleJson("time", "time_today", "today", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("today")); + QVERIFY(group.rules[0].regex.isValid()); + QVERIFY(group.rules[0].regex.match("today").hasMatch()); + QVERIFY(!group.rules[0].regex.match("yesterday").hasMatch()); +} + +void tst_TimeExtraction::presetYesterday() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "yesterday"; + QByteArray json = makeRuleJson("time", "time_yesterday", "yesterday", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("yesterday")); + QVERIFY(group.rules[0].regex.match("yesterday").hasMatch()); +} + +void tst_TimeExtraction::presetThisWeek() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "this_week"; + QByteArray json = 
makeRuleJson("time", "time_this_week", "this_week", 190, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+    QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("this_week"));
+}
+
+void tst_TimeExtraction::presetThisMonth()
+{
+    QVariantMap meta;
+    meta["type"] = "preset";
+    meta["preset"] = "this_month";
+    QByteArray json = makeRuleJson("time", "time_this_month", "this_month", 180, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+    QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("this_month"));
+}
+
+void tst_TimeExtraction::presetThisYear()
+{
+    QVariantMap meta;
+    meta["type"] = "preset";
+    meta["preset"] = "this_year";
+    QByteArray json = makeRuleJson("time", "time_this_year", "this_year", 170, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+    QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("this_year"));
+}
+
+void tst_TimeExtraction::presetLastYear()
+{
+    QVariantMap meta;
+    meta["type"] = "preset";
+    meta["preset"] = "last_year";
+    QByteArray json = makeRuleJson("time", "time_last_year", "last_year", 170, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+    QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("last_year"));
+}
+
+void tst_TimeExtraction::customYear()
+{
+    // Use programmatic JSON to avoid raw string delimiter conflict with regex patterns
+    QVariantMap meta;
+    meta["type"] = "custom";
+    meta["format"] = "year";
+    QByteArray json = makeRuleJson("time", "time_exact_year",
+                                   "(?<year>\\d{2,4})year", 160, meta);   // group name is read back via captured("year")
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+
+    auto match = group.rules[0].regex.match("2025year");
+    QVERIFY(match.hasMatch());
+    QCOMPARE(match.captured("year"), QString("2025"));
+}
+
+void tst_TimeExtraction::customYearMonth()
+{
+    QVariantMap meta;
+    meta["type"] = "custom";
+    meta["format"] = "year_month";
+    QByteArray json = makeRuleJson("time", "time_exact_year_month",
+                                   "(?<year>\\d{4})-(?<month>\\d{1,2})", 150, meta);   // named groups: year, month
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+
+    auto match = group.rules[0].regex.match("2025-12");
+    QVERIFY(match.hasMatch());
+    QCOMPARE(match.captured("year"), QString("2025"));
+    QCOMPARE(match.captured("month"), QString("12"));
+}
+
+void tst_TimeExtraction::customFullDate()
+{
+    QVariantMap meta;
+    meta["type"] = "custom";
+    meta["format"] = "full_date";
+    QByteArray json = makeRuleJson("time", "time_exact_full_date",
+                                   "(?<year>\\d{4})-(?<month>\\d{1,2})-(?<day>\\d{1,2})",   // named groups: year, month, day
+                                   140, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+
+    auto match = group.rules[0].regex.match("2025-03-15");
+    QVERIFY(match.hasMatch());
+    QCOMPARE(match.captured("year"), QString("2025"));
+    QCOMPARE(match.captured("month"), QString("03"));
+    QCOMPARE(match.captured("day"), QString("15"));
+}
+
+void tst_TimeExtraction::noMatch()
+{
+    QVariantMap meta;
+    meta["type"] = "preset";
+    meta["preset"] = "today";
+    QByteArray json = makeRuleJson("time", "time_today", "today", 200, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+    QVERIFY(!group.rules[0].regex.match("random text without match").hasMatch());
+}
+
+// ===== tst_FileTypeExtraction =====
+
+class tst_FileTypeExtraction : public QObject
+{
+    Q_OBJECT
+
+private Q_SLOTS:
+    void precisePdf();
+    void preciseWord();
+    void preciseExcel();
+    void precisePpt();
+    void imageType();
+    void videoType();
+    void audioType();
+    void genericDocument();
+    void genericSpreadsheet();
+    void genericPresentation();
+    void archiveType();
+    void extensionsList();
+    void generalFlag();
+};
+
+void tst_FileTypeExtraction::precisePdf()
+{
+    QVariantMap meta;
+    meta["extensions"] = QStringList({"pdf"});
+    QByteArray json = makeRuleJson("filetype", "filetype_pdf", "pdf", 200, meta);
+
+    RuleGroup group;
+    QVERIFY(buildGroupFromJson(json, group));
+    QVERIFY(group.rules[0].regex.match("pdf").hasMatch());
+    
QCOMPARE(group.rules[0].metadata.value("extensions").toStringList(), QStringList({"pdf"})); +} + +void tst_FileTypeExtraction::preciseWord() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"doc", "docx"}); + meta["fileTypes"] = QStringList({"doc"}); + QByteArray json = makeRuleJson("filetype", "filetype_word", "word|doc|docx", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("word").hasMatch()); + QVERIFY(group.rules[0].regex.match("docx").hasMatch()); + QVERIFY(!group.rules[0].regex.match("pdf").hasMatch()); +} + +void tst_FileTypeExtraction::preciseExcel() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"xls", "xlsx"}); + QByteArray json = makeRuleJson("filetype", "filetype_excel", "excel|xls|xlsx", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("excel").hasMatch()); + QVERIFY(group.rules[0].regex.match("xlsx").hasMatch()); +} + +void tst_FileTypeExtraction::precisePpt() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"ppt", "pptx"}); + QByteArray json = makeRuleJson("filetype", "filetype_ppt", "ppt|pptx", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("ppt").hasMatch()); + QVERIFY(group.rules[0].regex.match("pptx").hasMatch()); +} + +void tst_FileTypeExtraction::imageType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"jpg", "png", "gif"}); + meta["fileTypes"] = QStringList({"pic"}); + QByteArray json = makeRuleJson("filetype", "filetype_image", "image", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("fileTypes").toStringList(), QStringList({"pic"})); +} + +void tst_FileTypeExtraction::videoType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"mp4", "avi", "mkv"}); + meta["fileTypes"] = QStringList({"video"}); + QByteArray json = 
makeRuleJson("filetype", "filetype_video", "video", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("fileTypes").toStringList(), QStringList({"video"})); +} + +void tst_FileTypeExtraction::audioType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"mp3", "wav", "flac"}); + meta["fileTypes"] = QStringList({"audio"}); + QByteArray json = makeRuleJson("filetype", "filetype_audio", "audio", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("fileTypes").toStringList(), QStringList({"audio"})); +} + +void tst_FileTypeExtraction::genericDocument() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"doc", "docx", "pdf", "txt"}); + meta["fileTypes"] = QStringList({"doc"}); + meta["general"] = true; + QByteArray json = makeRuleJson("filetype", "filetype_document_general", "document", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("general").toBool()); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList().size(), 4); +} + +void tst_FileTypeExtraction::genericSpreadsheet() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"xls", "xlsx", "csv"}); + meta["general"] = true; + QByteArray json = makeRuleJson("filetype", "filetype_spreadsheet_general", "spreadsheet", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("general").toBool()); +} + +void tst_FileTypeExtraction::genericPresentation() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"ppt", "pptx", "dps"}); + meta["general"] = true; + QByteArray json = makeRuleJson("filetype", "filetype_presentation_general", "presentation", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("general").toBool()); +} + +void 
tst_FileTypeExtraction::archiveType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"zip", "tar", "rar", "7z"}); + QByteArray json = makeRuleJson("filetype", "filetype_archive", "archive", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList().size(), 4); +} + +void tst_FileTypeExtraction::extensionsList() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"a", "b", "c", "d", "e"}); + QByteArray json = makeRuleJson("filetype", "ft", "test", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList().size(), 5); +} + +void tst_FileTypeExtraction::generalFlag() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"pdf"}); + QByteArray json = makeRuleJson("filetype", "ft_precise", "pdf", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(!group.rules[0].metadata.value("general").toBool()); +} + +// ===== tst_KeywordExtraction ===== + +class tst_KeywordExtraction : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void containsPattern(); + void namedPattern(); + void contentHasPattern(); + void noMatch(); + void captureGroup(); + void multiKeywordFlag(); +}; + +void tst_KeywordExtraction::containsPattern() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = true; + QByteArray json = makeRuleJson("keyword", "keyword_contains", + "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("contains meeting notes"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured(1), QString(" meeting notes")); +} + +void tst_KeywordExtraction::namedPattern() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = false; + QByteArray json = makeRuleJson("keyword", "keyword_named", + "named (.+?)(?: of|$)", 200, 
meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("named report of"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured(1), QString("report")); +} + +void tst_KeywordExtraction::contentHasPattern() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = true; + QByteArray json = makeRuleJson("keyword", "keyword_content_has", + "content(?: has| contains| includes)(.+?)(?: of|$)", + 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("content includes budget data"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured(1), QString(" budget data")); +} + +void tst_KeywordExtraction::noMatch() +{ + QVariantMap meta; + meta["capture_group"] = 1; + QByteArray json = makeRuleJson("keyword", "keyword_contains", + "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(!group.rules[0].regex.match("no keyword pattern here").hasMatch()); +} + +void tst_KeywordExtraction::captureGroup() +{ + QVariantMap meta; + meta["capture_group"] = 1; + QByteArray json = makeRuleJson("keyword", "k1", "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("capture_group").toInt(), 1); +} + +void tst_KeywordExtraction::multiKeywordFlag() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = true; + QByteArray json = makeRuleJson("keyword", "k1", "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("multi_keyword").toBool()); +} + +// ===== tst_ParsedIntent ===== + +class tst_ParsedIntent : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void defaultState(); + void timeConstraintDefault(); + void timeConstraintPreset(); + void matchSpanValidity(); +}; + +void 
tst_ParsedIntent::defaultState()
+{
+    ParsedIntent intent;
+    QVERIFY(intent.timeConstraint.kind == TimeConstraintKind::None);
+    QVERIFY(intent.fileExtensions.isEmpty());
+    QVERIFY(intent.keywords.isEmpty());
+    QVERIFY(intent.consumedSpans.isEmpty());
+}
+
+void tst_ParsedIntent::timeConstraintDefault()
+{
+    TimeConstraint tc;
+    QVERIFY(!tc.isValid());
+    QCOMPARE(tc.kind, TimeConstraintKind::None);
+}
+
+void tst_ParsedIntent::timeConstraintPreset()
+{
+    TimeConstraint tc;
+    tc.kind = TimeConstraintKind::Preset;
+    tc.preset = TimePreset::Today;
+    QVERIFY(tc.isValid());
+}
+
+void tst_ParsedIntent::matchSpanValidity()
+{
+    MatchSpan span;
+    QVERIFY(!span.isValid());
+
+    span.start = 0;
+    span.end = 5;
+    span.ruleId = "test_rule";
+    QVERIFY(span.isValid());
+}
+
+// ===== Factory functions =====
+
+QObject *create_tst_RuleEngine() { return new tst_RuleEngine(); }
+QObject *create_tst_TimeExtraction() { return new tst_TimeExtraction(); }
+QObject *create_tst_FileTypeExtraction() { return new tst_FileTypeExtraction(); }
+QObject *create_tst_KeywordExtraction() { return new tst_KeywordExtraction(); }
+QObject *create_tst_ParsedIntent() { return new tst_ParsedIntent(); }
+
+#include "tst_semantic_search.moc"
diff --git a/debian/libdfm-search.install b/debian/libdfm-search.install
index d0fd2344..8c90bd37 100644
--- a/debian/libdfm-search.install
+++ b/debian/libdfm-search.install
@@ -1,2 +1,3 @@
 usr/lib/*/libdfm-search*.so*
-usr/bin/dfm-searcher
\ No newline at end of file
+usr/bin/dfm-searcher
+usr/share/deepin/dfm-search/semantic/rules/*
\ No newline at end of file
diff --git a/debian/libdfm6-search.install b/debian/libdfm6-search.install
index 9c76e128..f4bb6e06 100644
--- a/debian/libdfm6-search.install
+++ b/debian/libdfm6-search.install
@@ -1,2 +1,3 @@
 usr/lib/*/libdfm6-search*.so*
-usr/bin/dfm-searcher
\ No newline at end of file
+usr/bin/dfm-searcher
+usr/share/deepin/dfm6-search/semantic/rules/*
\ No newline at end of file
diff --git 
a/include/dfm-search/dfm-search/dimensionextractor.h b/include/dfm-search/dfm-search/dimensionextractor.h new file mode 100644 index 00000000..c798a14f --- /dev/null +++ b/include/dfm-search/dfm-search/dimensionextractor.h @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef DIMENSIONEXTRACTOR_H +#define DIMENSIONEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class DimensionExtractor +{ +public: + virtual ~DimensionExtractor() = default; + + /** + * @brief Extract a dimension from the input text and populate the intent. + * @param input The raw natural language input + * @param intent The intent to populate with extracted data + */ + virtual void extract(const QString &input, ParsedIntent &intent) = 0; + + /** + * @brief Get the name of this extractor for debugging. + */ + virtual QString name() const = 0; +}; + +DFM_SEARCH_END_NS + +#endif // DIMENSIONEXTRACTOR_H diff --git a/include/dfm-search/dfm-search/semantic_types.h b/include/dfm-search/dfm-search/semantic_types.h new file mode 100644 index 00000000..f63be806 --- /dev/null +++ b/include/dfm-search/dfm-search/semantic_types.h @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTIC_TYPES_H +#define SEMANTIC_TYPES_H + +#include +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +/** + * @brief Represents a consumed span in the input text matched by a rule. + */ +struct MatchSpan +{ + int start = -1; + int end = -1; + QString ruleId; + + bool isValid() const { return start >= 0 && end > start; } +}; + +/** + * @brief Enum for preset time periods. + */ +enum class TimePreset { + Today, + Yesterday, + DayBeforeYesterday, + ThisWeek, + LastWeek, + ThisMonth, + LastMonth, + ThisYear, + LastYear +}; + +/** + * @brief Enum for time constraint kinds. 
+ */
+enum class TimeConstraintKind {
+    None,      ///< No time constraint
+    Preset,    ///< Preset period (today, yesterday, etc.)
+    Relative,  ///< Relative time (last N days/hours)
+    Custom     ///< Custom datetime range
+};
+
+/**
+ * @brief Represents a parsed time constraint from natural language.
+ */
+struct TimeConstraint
+{
+    TimeConstraintKind kind = TimeConstraintKind::None;
+    TimePreset preset = TimePreset::Today;
+    int relativeValue = 0;
+    TimeUnit relativeUnit = TimeUnit::Days;
+    QDateTime customStart;
+    QDateTime customEnd;
+
+    bool isValid() const { return kind != TimeConstraintKind::None; }
+};
+
+/**
+ * @brief Represents the parsed intent from natural language input.
+ *
+ * This is the intermediate representation between NLP parsing
+ * and search query construction. Declared public for future
+ * structured API extensibility.
+ */
+struct ParsedIntent
+{
+    TimeConstraint timeConstraint;
+    QStringList fileExtensions;
+    QStringList keywords;
+    QList<MatchSpan> consumedSpans;   ///< Input spans already claimed by matched rules
+};
+
+DFM_SEARCH_END_NS
+
+#endif   // SEMANTIC_TYPES_H
diff --git a/include/dfm-search/dfm-search/semanticsearcher.h b/include/dfm-search/dfm-search/semanticsearcher.h
new file mode 100644
index 00000000..03775085
--- /dev/null
+++ b/include/dfm-search/dfm-search/semanticsearcher.h
@@ -0,0 +1,131 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef SEMANTICSEARCHER_H
+#define SEMANTICSEARCHER_H
+
+#include 
+
+#include 
+#include 
+#include 
+
+DFM_SEARCH_BEGIN_NS
+
+class SemanticSearcherData;
+
+/**
+ * @brief The SemanticSearcher class provides natural language based file search.
+ *
+ * This class parses natural language queries (e.g., "today's pdf documents")
+ * into structured search conditions, then orchestrates parallel searches
+ * across filename, content, and OCR indexes.
+ * + * Usage: + * @code + * SemanticSearcher *searcher = new SemanticSearcher(this); + * connect(searcher, &SemanticSearcher::resultsFound, [](const SearchResultList &results) { + * for (const auto &r : results) { + * qDebug() << r.path(); + * } + * }); + * searcher->search("today's pdf documents"); + * @endcode + */ +class SemanticSearcher : public QObject +{ + Q_OBJECT + +public: + /** + * @brief Construct a semantic searcher + * @param parent Parent QObject + */ + explicit SemanticSearcher(QObject *parent = nullptr); + + /** + * @brief Destructor + */ + ~SemanticSearcher() override; + + /** + * @brief Get the current search status + */ + SearchStatus status() const; + + /** + * @brief Set the search timeout in seconds + * @param seconds Timeout duration (default 60, 0 to disable) + */ + void setSearchTimeout(int seconds); + + /** + * @brief Get the search timeout in seconds + */ + int searchTimeout() const; + + /** + * @brief Perform a semantic search with natural language input + * @param naturalLanguage The natural language query string + */ + void search(const QString &naturalLanguage); + + /** + * @brief Perform a synchronous semantic search + * + * Blocks the calling thread until all search engines complete or timeout. + * Uses QEventLoop internally, so it works from the GUI thread. 
+ * @param naturalLanguage The natural language query string + * @return SearchResultExpected containing deduplicated results or an error + */ + SearchResultExpected searchSync(const QString &naturalLanguage); + + /** + * @brief Cancel the current search operation + */ + void cancel(); + +Q_SIGNALS: + /** + * @brief Emitted when a search operation starts + */ + void searchStarted(); + + /** + * @brief Emitted when search results are found + * @param results The found search results + */ + void resultsFound(const DFMSEARCH::SearchResultList &results); + + /** + * @brief Emitted when the search status changes + * @param status The new search status + */ + void statusChanged(SearchStatus status); + + /** + * @brief Emitted when a search operation completes + * @param results The list of all search results (deduplicated) + */ + void searchFinished(const DFMSEARCH::SearchResultList &results); + + /** + * @brief Emitted when a search operation is cancelled + */ + void searchCancelled(); + + /** + * @brief Emitted when an error occurs during search + * @param error The SearchError that occurred + */ + void errorOccurred(const DFMSEARCH::SearchError &error); + +private: + Q_DISABLE_COPY(SemanticSearcher) + std::unique_ptr d_ptr; +}; + +DFM_SEARCH_END_NS + +#endif // SEMANTICSEARCHER_H diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index 861edf00..e2b078d0 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -33,6 +33,9 @@ CliOptions::CliOptions() m_verboseOption(QStringList() << "verbose" << "v", "Enable verbose output with detailed result information"), + m_semanticOption(QStringList() << "semantic" + << "s", + "Enable semantic natural language search"), m_timeFieldOption(QStringList() << "time-field", "Time field to filter (birth or modify)", "field", "modify"), m_timeLastOption(QStringList() << "time-last", "Rolling time window (e.g., 3d, 
2h, 30m)", "duration"), m_timeTodayOption(QStringList() << "time-today", "Filter files from today"), @@ -69,8 +72,9 @@ void CliOptions::setupOptions() m_parser.addOption(m_wildcardOption); m_parser.addOption(m_jsonOption); m_parser.addOption(m_verboseOption); + m_parser.addOption(m_semanticOption); - // 时间范围过滤选项 + // Time range filtering options m_parser.addOption(m_timeFieldOption); m_parser.addOption(m_timeLastOption); m_parser.addOption(m_timeTodayOption); @@ -90,7 +94,11 @@ void CliOptions::setupOptions() void CliOptions::printHelp() const { - std::cout << "Usage: dfm-searcher [options] " << std::endl; + std::cout << "Usage: dfm-searcher [options] [search_path]" << std::endl; + std::cout << std::endl; + std::cout << "Semantic Search:" << std::endl; + std::cout << " --semantic, -s Enable semantic natural language search" << std::endl; + std::cout << " Example: dfm-searcher -s \"recent 3 days images\" /home/user" << std::endl; std::cout << std::endl; std::cout << "Search Types:" << std::endl; std::cout << " --type= Search type (default: filename)" << std::endl; @@ -145,6 +153,12 @@ void CliOptions::printHelp() const std::cout << std::endl; std::cout << " # Realtime search with time filter" << std::endl; std::cout << " dfm-searcher --method=realtime --time-last=7d \"report\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << " # Semantic search: find recent images" << std::endl; + std::cout << " dfm-searcher --semantic \"recent 3 days images\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << " # Semantic search with JSON output" << std::endl; + std::cout << " dfm-searcher -s -j \"content contains meeting notes\" /home/user" << std::endl; } bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) @@ -152,22 +166,46 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) m_parser.process(app); QStringList positionalArgs = m_parser.positionalArguments(); - if (positionalArgs.size() < 2) { + if 
(positionalArgs.isEmpty()) { printHelp(); return false; } + // Semantic mode: only keyword is required, search path is optional + config.semanticMode = m_parser.isSet(m_semanticOption); config.keyword = positionalArgs.at(0); - config.searchPath = positionalArgs.at(1); + if (positionalArgs.size() >= 2) { + config.searchPath = positionalArgs.at(1); + } - // 验证搜索路径 - QFileInfo pathInfo(config.searchPath); - if (!pathInfo.exists() || !pathInfo.isDir()) { - std::cerr << "Error: Search path does not exist or is not a directory" << std::endl; + // Validate search path (not required in semantic mode) + if (!config.searchPath.isEmpty()) { + QFileInfo pathInfo(config.searchPath); + if (!pathInfo.exists() || !pathInfo.isDir()) { + std::cerr << "Error: Search path does not exist or is not a directory" << std::endl; + return false; + } + } else if (!config.semanticMode) { + std::cerr << "Error: Search path is required" << std::endl; + printHelp(); return false; } - // 解析搜索类型 + // In semantic mode, skip type/method/query parsing + if (config.semanticMode) { + config.jsonOutput = m_parser.isSet(m_jsonOption); + config.verbose = m_parser.isSet(m_verboseOption); + if (m_parser.isSet(m_maxPreviewOption)) { + bool ok; + int previewLength = m_parser.value(m_maxPreviewOption).toInt(&ok); + if (ok && previewLength > 0) { + config.maxPreviewLength = previewLength; + } + } + return true; + } + + // Parse search type (non-semantic mode only) QString typeStr = m_parser.value(m_typeOption); if (typeStr == "content") { config.searchType = SearchType::Content; diff --git a/src/dfm-search/dfm-search-client/cli_options.h b/src/dfm-search/dfm-search-client/cli_options.h index a96d782d..9edbd60e 100644 --- a/src/dfm-search/dfm-search-client/cli_options.h +++ b/src/dfm-search/dfm-search-client/cli_options.h @@ -47,7 +47,10 @@ struct SearchCliConfig // 文件名搜索选项 QString filenameKeyword; - // 时间范围过滤 + // Semantic mode + bool semanticMode = false; + + // Time range filtering bool hasTimeFilter = false; 
DFMSEARCH::TimeRangeFilter timeFilter; }; @@ -100,8 +103,9 @@ class CliOptions QCommandLineOption m_wildcardOption; QCommandLineOption m_jsonOption; QCommandLineOption m_verboseOption; + QCommandLineOption m_semanticOption; - // 时间范围过滤选项 + // Time range filtering options QCommandLineOption m_timeFieldOption; QCommandLineOption m_timeLastOption; QCommandLineOption m_timeTodayOption; diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index 9197773f..db1af343 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "cli_options.h" #include "output/text_output.h" @@ -156,7 +157,38 @@ int main(int argc, char *argv[]) return 1; } - // 创建搜索引擎 + // Semantic search mode + if (config.semanticMode) { + auto *semanticSearcher = new DFMSEARCH::SemanticSearcher(&app); + + OutputFormatter *formatter = createOutputFormatter(config, &app); + formatter->setSearchContext(config.keyword, config.searchPath, + SearchType::FileName, SearchMethod::Indexed); + + QObject::connect(formatter, &OutputFormatter::finished, &app, &QCoreApplication::quit); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchStarted, [formatter]() { + formatter->outputSearchStarted(); + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::resultsFound, [formatter](const SearchResultList &results) { + for (const auto &result : results) { + formatter->outputResult(result); + } + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchFinished, [formatter](const SearchResultList &results) { + formatter->outputSearchFinished(results); + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchCancelled, [formatter]() { + formatter->outputSearchCancelled(); + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::errorOccurred, [formatter](const DFMSEARCH::SearchError 
&error) { + formatter->outputError(error); + }); + + semanticSearcher->search(config.keyword); + return app.exec(); + } + + // Create search engine (non-semantic mode) SearchEngine *engine = SearchFactory::createEngine(config.searchType, &app); if (!engine) { qCritical() << "Error: Failed to create search engine"; diff --git a/src/dfm-search/dfm-search-lib/dfm-search.cmake b/src/dfm-search/dfm-search-lib/dfm-search.cmake index 6696236b..74181757 100644 --- a/src/dfm-search/dfm-search-lib/dfm-search.cmake +++ b/src/dfm-search/dfm-search-lib/dfm-search.cmake @@ -18,6 +18,11 @@ add_library(${BIN_NAME} SHARED ${SRCS} ) +target_compile_definitions(${BIN_NAME} PRIVATE + CMAKE_INSTALL_PREFIX="${CMAKE_INSTALL_PREFIX}" + DFM_SEARCH_LIB_NAME="${BIN_NAME}" +) + target_link_libraries(${BIN_NAME} PUBLIC Qt${QT_VERSION_MAJOR}::Core Dtk${DFM_VERSION_MAJOR}::Core @@ -76,6 +81,12 @@ install(DIRECTORY FILES_MATCHING PATTERN "*.h" ) +# install semantic search rules (locale subdirectories preserved) +install(DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR}/semantic/rules/ + DESTINATION share/deepin/${BIN_NAME}/semantic/rules +) + # for pc file config - update to include all dependencies set(PC_LIBS_PRIVATE Qt${QT_VERSION_MAJOR}Core dtk${DFM_VERSION_MAJOR}core) set(PC_REQ_PRIVATE liblucene++ liblucene++-contrib) diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp new file mode 100644 index 00000000..b52d876d --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "filetypeextractor.h"
+
+#include "semantic/semanticruleengine.h"
+
+DFM_SEARCH_BEGIN_NS
+
+FileTypeExtractor::FileTypeExtractor(SemanticRuleEngine *engine)
+    : m_engine(engine)
+{
+}
+
+FileTypeExtractor::~FileTypeExtractor() = default;
+
+/**
+ * @brief Collects file extensions from every "filetype" rule that matches the input.
+ *
+ * Rules whose metadata carries "general": true are fallbacks: their extensions
+ * are used only when no specific rule matched. Every matched region, general or
+ * not, is appended to intent.consumedSpans so that type words are not picked
+ * up again as keywords by a later extractor.
+ *
+ * @param input  The raw query text.
+ * @param intent Receives fileExtensions (overwritten) and additional consumedSpans.
+ */
+void FileTypeExtractor::extract(const QString &input, ParsedIntent &intent)
+{
+    if (!m_engine->hasGroup("filetype")) {
+        return;
+    }
+
+    QStringList ruleIds;
+    const QList<QRegularExpressionMatch> matches = m_engine->matchAll("filetype", input, &ruleIds);
+
+    // Ordered lists keep the result deterministic; the sets only de-duplicate.
+    QStringList specificExtensions;
+    QStringList generalExtensions;
+    QSet<QString> seenSpecific;
+    QSet<QString> seenGeneral;
+
+    for (int i = 0; i < matches.size(); ++i) {
+        const QRegularExpressionMatch &m = matches[i];
+        // value() tolerates a rule-id list shorter than the match list.
+        const QString ruleId = ruleIds.value(i);
+        const QVariantMap metadata = m_engine->ruleMetadata("filetype", ruleId);
+
+        const QStringList extensions = metadata.value("extensions").toStringList();
+        const bool isGeneral = metadata.value("general", false).toBool();
+
+        // Keep general and specific extensions apart so the final choice does
+        // not depend on the order in which the engine reports matches.
+        QStringList &ordered = isGeneral ? generalExtensions : specificExtensions;
+        QSet<QString> &seen = isGeneral ? seenGeneral : seenSpecific;
+        for (const QString &ext : extensions) {
+            if (!seen.contains(ext)) {
+                seen.insert(ext);
+                ordered.append(ext);
+            }
+        }
+
+        // Consume the span even for a general match that ends up unused:
+        // the word still denotes a file type, not a search keyword.
+        MatchSpan span;
+        span.start = m.capturedStart();
+        span.end = m.capturedEnd();
+        span.ruleId = ruleId;
+        intent.consumedSpans.append(span);
+    }
+
+    // Specific types win; general ones are only a fallback.
+    intent.fileExtensions = specificExtensions.isEmpty() ? generalExtensions : specificExtensions;
+}
+
+QString FileTypeExtractor::name() const
+{
+    return QStringLiteral("filetype");
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h
new file mode 100644
index 00000000..1f50faf1
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef FILETYPEEXTRACTOR_H
+#define FILETYPEEXTRACTOR_H
+
+// NOTE(review): the include target below was lost in extraction; it must be the
+// header that declares DimensionExtractor and ParsedIntent - confirm the path.
+#include 
+
+DFM_SEARCH_BEGIN_NS
+
+class SemanticRuleEngine;
+
+/**
+ * @brief Dimension extractor driven by the "filetype" rule group.
+ */
+class FileTypeExtractor : public DimensionExtractor
+{
+public:
+    explicit FileTypeExtractor(SemanticRuleEngine *engine);
+    ~FileTypeExtractor() override;
+
+    void extract(const QString &input, ParsedIntent &intent) override;
+    QString name() const override;
+
+private:
+    SemanticRuleEngine *m_engine;
+};
+
+DFM_SEARCH_END_NS
+
+#endif // FILETYPEEXTRACTOR_H
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp
new file mode 100644
index 00000000..0feda2dd
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp
@@ -0,0 +1,199 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "keywordextractor.h"
+
+#include "semantic/semanticruleengine.h"
+
+#include <QRegularExpression>
+
+DFM_SEARCH_BEGIN_NS
+
+KeywordExtractor::KeywordExtractor(SemanticRuleEngine *engine)
+    : m_engine(engine)
+{
+}
+
+KeywordExtractor::~KeywordExtractor() = default;
+
+/**
+ * @brief Fills intent.keywords, preferring an explicit keyword pattern.
+ *
+ * Falls back to whatever text is left after removing intent.consumedSpans
+ * and noise words when no "keyword" rule matches.
+ */
+void KeywordExtractor::extract(const QString &input, ParsedIntent &intent)
+{
+    // Strategy 1: structured keyword patterns (e.g., "contains X and Y")
+    if (extractStructuredKeywords(input, intent)) {
+        return;
+    }
+
+    // Strategy 2: extract unconsumed text regions
+    extractUnconsumedText(input, intent);
+}
+
+/**
+ * @brief Tries the "keyword" rule group and takes the keyword(s) from a capture group.
+ *
+ * The group index comes from the rule's "capture_group" metadata (default 1).
+ * When that group did not take part in the match - which happens for patterns
+ * built as an alternation with one group per branch - the first non-empty
+ * capture group is used instead.
+ *
+ * @return true if keywords were stored and the matched region was consumed.
+ */
+bool KeywordExtractor::extractStructuredKeywords(const QString &input, ParsedIntent &intent)
+{
+    if (!m_engine->hasGroup("keyword")) {
+        return false;
+    }
+
+    QString ruleId;
+    QRegularExpressionMatch match;
+    if (!m_engine->match("keyword", input, match, &ruleId)) {
+        return false;
+    }
+
+    const QVariantMap metadata = m_engine->ruleMetadata("keyword", ruleId);
+    const int captureGroup = metadata.value("capture_group", 1).toInt();
+    if (captureGroup <= 0) {
+        return false;
+    }
+
+    const int lastGroup = match.lastCapturedIndex();
+
+    QString captured;
+    if (captureGroup <= lastGroup) {
+        captured = match.captured(captureGroup).trimmed();
+    }
+
+    // Alternation fallback: the configured group is empty when a different
+    // branch of the pattern matched, so look for the group that did capture.
+    for (int group = 1; captured.isEmpty() && group <= lastGroup; ++group) {
+        captured = match.captured(group).trimmed();
+    }
+
+    if (captured.isEmpty()) {
+        return false;
+    }
+
+    const bool multiKeyword = metadata.value("multi_keyword", false).toBool();
+
+    if (multiKeyword) {
+        intent.keywords = splitMultiKeywords(captured, metadata);
+    } else {
+        intent.keywords = { captured };
+    }
+
+    // Mark the entire matched region as consumed
+    MatchSpan span;
+    span.start = match.capturedStart();
+    span.end = match.capturedEnd();
+    span.ruleId = ruleId;
+    intent.consumedSpans.append(span);
+
+    return true;
+}
+
+/**
+ * @brief Uses the text not covered by consumed spans or noise words as a single keyword.
+ *
+ * Leaves intent.keywords untouched when nothing usable remains.
+ */
+void KeywordExtractor::extractUnconsumedText(const QString &input, ParsedIntent &intent)
+{
+    QList<MatchSpan> allSpans = intent.consumedSpans;
+
+    // Also consume noise words
+    if (m_engine->hasGroup("noise")) {
+        QStringList noiseRuleIds;
+        const QList<QRegularExpressionMatch> noiseMatches =
+                m_engine->matchAll("noise", input, &noiseRuleIds);
+
+        for (int i = 0; i < noiseMatches.size(); ++i) {
+            MatchSpan span;
+            span.start = noiseMatches[i].capturedStart();
+            span.end = noiseMatches[i].capturedEnd();
+            span.ruleId = noiseRuleIds.value(i);
+            allSpans.append(span);
+        }
+    }
+
+    // Extract text not covered by any consumed span
+    const QString unconsumed = extractUnconsumedRegions(input, allSpans);
+
+    if (unconsumed.isEmpty()) {
+        return;
+    }
+
+    // Default cleanup: collapse runs of whitespace
+    QRegularExpression cleanupRe(QStringLiteral("[\\s]+"));
+
+    // cleanup_pattern is a configuration property rather than a per-match one,
+    // so take the first non-empty value found on ANY rule of the keyword group.
+    if (m_engine->hasGroup("keyword")) {
+        const QStringList allRuleIds = m_engine->ruleIds("keyword");
+        for (const QString &rid : allRuleIds) {
+            const QVariantMap meta = m_engine->ruleMetadata("keyword", rid);
+            const QString pattern = meta.value("cleanup_pattern").toString();
+            if (!pattern.isEmpty()) {
+                cleanupRe.setPattern(pattern);
+                break;
+            }
+        }
+    }
+
+    const QString cleaned = unconsumed.trimmed()
+                                    .replace(cleanupRe, " ")
+                                    .simplified();
+
+    if (cleaned.isEmpty()) {
+        return;
+    }
+
+    intent.keywords = { cleaned };
+}
+
+/**
+ * @brief Returns the parts of input not covered by any span, joined by single spaces.
+ *
+ * Spans that are invalid or end beyond the input are ignored.
+ */
+QString KeywordExtractor::extractUnconsumedRegions(const QString &input, const QList<MatchSpan> &allSpans) const
+{
+    if (input.isEmpty()) {
+        return {};
+    }
+
+    // Per-character coverage map
+    QVector<bool> consumed(input.size(), false);
+    for (const MatchSpan &span : allSpans) {
+        if (span.isValid() && span.end <= input.size()) {
+            for (int i = span.start; i < span.end; ++i) {
+                consumed[i] = true;
+            }
+        }
+    }
+
+    // Walk the map and copy out each maximal uncovered run
+    QString result;
+    int regionStart = -1;
+
+    for (int i = 0; i < input.size(); ++i) {
+        if (!consumed[i]) {
+            if (regionStart < 0) {
+                regionStart = i;
+            }
+        } else if (regionStart >= 0) {
+            result += input.mid(regionStart, i - regionStart) + " ";
+            regionStart = -1;
+        }
+    }
+
+    // Trailing region
+    if (regionStart >= 0) {
+        result += input.mid(regionStart);
+    }
+
+    return result.trimmed();
+}
+
+/**
+ * @brief Splits captured text into trimmed, non-empty keywords.
+ *
+ * Uses the rule's "split_pattern" metadata when present, otherwise splits on commas.
+ */
+QStringList KeywordExtractor::splitMultiKeywords(const QString &text, const QVariantMap &metadata)
+{
+    // Default split on comma
+    QString splitPattern = QStringLiteral("[,]+");
+
+    // Try to get language-specific split pattern from metadata
+    const QString metaSplit = metadata.value("split_pattern").toString();
+    if (!metaSplit.isEmpty()) {
+        splitPattern = metaSplit;
+    }
+
+    const QRegularExpression splitRe(splitPattern);
+    const QStringList parts = text.split(splitRe, Qt::SkipEmptyParts);
+    QStringList result;
+    for (const QString &part : parts) {
+        const QString trimmed = part.trimmed();
+        if (!trimmed.isEmpty()) {
+            result.append(trimmed);
+        }
+    }
+    return result;
+}
+
+QString KeywordExtractor::name() const
+{
+    return QStringLiteral("keyword");
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h
new file mode 100644
index 00000000..07e0e935
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef KEYWORDEXTRACTOR_H
+#define KEYWORDEXTRACTOR_H
+
+// NOTE(review): the include target below was lost in extraction; it must be the
+// header that declares DimensionExtractor, ParsedIntent and MatchSpan - confirm the path.
+#include 
+
+DFM_SEARCH_BEGIN_NS
+
+class SemanticRuleEngine;
+
+/**
+ * @brief Dimension extractor that derives search keywords from the input.
+ */
+class KeywordExtractor : public DimensionExtractor
+{
+public:
+    explicit KeywordExtractor(SemanticRuleEngine *engine);
+    ~KeywordExtractor() override;
+
+    void extract(const QString &input, ParsedIntent &intent) override;
+    QString name() const override;
+
+private:
+    bool extractStructuredKeywords(const QString &input, ParsedIntent &intent);
+    void extractUnconsumedText(const QString &input, ParsedIntent &intent);
+    QString extractUnconsumedRegions(const QString &input, const QList<MatchSpan> &allSpans) const;
+    static QStringList splitMultiKeywords(const QString &text, const QVariantMap &metadata);
+
+    SemanticRuleEngine *m_engine;
+};
+
+DFM_SEARCH_END_NS
+
+#endif // KEYWORDEXTRACTOR_H
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp
new file mode 100644
index 00000000..c4f77443
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp
@@ -0,0 +1,214 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "timeextractor.h"
+
+#include "../semanticruleengine.h"
+
+#include <QDateTime>
+#include <QDebug>
+
+DFM_SEARCH_BEGIN_NS
+
+TimeExtractor::TimeExtractor(SemanticRuleEngine *engine)
+    : m_engine(engine)
+{
+}
+
+TimeExtractor::~TimeExtractor() = default;
+
+/**
+ * @brief Matches the "time" rule group and stores the resulting constraint.
+ *
+ * The rule's "type" metadata selects the handling: "preset" maps the "preset"
+ * string onto a TimePreset, "custom" builds a date range from the match's
+ * named captures. The intent is modified only when a valid constraint results.
+ */
+void TimeExtractor::extract(const QString &input, ParsedIntent &intent)
+{
+    if (!m_engine->hasGroup("time")) {
+        return;
+    }
+
+    QString ruleId;
+    QRegularExpressionMatch match;
+    if (!m_engine->match("time", input, match, &ruleId)) {
+        return;
+    }
+
+    const QVariantMap metadata = m_engine->ruleMetadata("time", ruleId);
+    const QString typeStr = metadata.value("type").toString();
+    TimeConstraint tc;
+
+    if (typeStr == "preset") {
+        const QString presetStr = metadata.value("preset").toString();
+        static const QMap<QString, TimePreset> kPresetMap = {
+            {"today", TimePreset::Today},
+            {"yesterday", TimePreset::Yesterday},
+            {"day_before_yesterday", TimePreset::DayBeforeYesterday},
+            {"this_week", TimePreset::ThisWeek},
+            {"last_week", TimePreset::LastWeek},
+            {"this_month", TimePreset::ThisMonth},
+            {"last_month", TimePreset::LastMonth},
+            {"this_year", TimePreset::ThisYear},
+            {"last_year", TimePreset::LastYear},
+        };
+
+        const auto preset = kPresetMap.constFind(presetStr);
+        if (preset != kPresetMap.constEnd()) {
+            tc.kind = TimeConstraintKind::Preset;
+            tc.preset = preset.value();
+        }
+    } else if (typeStr == "custom") {
+        parseCustomTime(match, metadata, tc);
+    }
+
+    if (tc.isValid()) {
+        intent.timeConstraint = tc;
+        MatchSpan span;
+        span.start = match.capturedStart();
+        span.end = match.capturedEnd();
+        span.ruleId = ruleId;
+        intent.consumedSpans.append(span);
+    }
+}
+
+/**
+ * @brief Builds a custom date range from the named captures "year", "month" and "day".
+ *
+ * A missing year defaults to the current year; values below 100 are read as 20xx.
+ * Month plus day gives a single day, month alone the whole month, and no month
+ * the whole year. On any invalid component a warning is logged and tc is left
+ * unchanged.
+ */
+void TimeExtractor::parseCustomTime(const QRegularExpressionMatch &match,
+                                    const QVariantMap &metadata,
+                                    TimeConstraint &tc)
+{
+    auto tryCapture = [&match](const QString &name) -> QString {
+        const QString val = match.captured(name);
+        return val.isNull() ? QString() : val;
+    };
+
+    // Load locale-aware number conversion from rule metadata
+    const QMap<QString, int> digitMap = mapFromVariant(metadata.value("digit_map"));
+    const QString tensUnit = metadata.value("tens_unit").toString();
+
+    const QDate today = QDate::currentDate();
+    int year = today.year();
+    int month = 0;
+    int day = 0;
+
+    {
+        const QString yearStr = tryCapture("year");
+        if (!yearStr.isEmpty()) {
+            year = localeAwareToInt(yearStr, digitMap, tensUnit);
+            if (year <= 0) {
+                qWarning() << "Invalid year:" << yearStr;
+                return;
+            }
+            if (year < 100) {
+                year += 2000;
+            }
+        }
+    }
+
+    {
+        const QString monthStr = tryCapture("month");
+        if (!monthStr.isEmpty()) {
+            month = localeAwareToInt(monthStr, digitMap, tensUnit);
+            if (month < 1 || month > 12) {
+                qWarning() << "Invalid month:" << monthStr;
+                return;
+            }
+        }
+    }
+
+    {
+        const QString dayStr = tryCapture("day");
+        if (!dayStr.isEmpty()) {
+            day = localeAwareToInt(dayStr, digitMap, tensUnit);
+            if (day < 1 || day > 31) {
+                qWarning() << "Invalid day:" << dayStr;
+                return;
+            }
+        }
+    }
+
+    // Validate date
+    if (month > 0 && day > 0) {
+        QDate date(year, month, day);
+        if (!date.isValid()) {
+            qWarning() << "Invalid date:" << year << month << day;
+            return;
+        }
+        tc.kind = TimeConstraintKind::Custom;
+        tc.customStart = QDateTime(date, QTime(0, 0, 0));
+        tc.customEnd = QDateTime(date, QTime(23, 59, 59));
+    } else if (month > 0 && day == 0) {
+        // Year-month only: entire month
+        QDate monthStart(year, month, 1);
+        QDate monthEnd(year, month, monthStart.daysInMonth());
+        tc.kind = TimeConstraintKind::Custom;
+        tc.customStart = QDateTime(monthStart, QTime(0, 0, 0));
+        tc.customEnd = QDateTime(monthEnd, QTime(23, 59, 59));
+    } else if (month == 0) {
+        // Year only: entire year
+        tc.kind = TimeConstraintKind::Custom;
+        tc.customStart = QDateTime(QDate(year, 1, 1), QTime(0, 0, 0));
+        tc.customEnd = QDateTime(QDate(year, 12, 31), QTime(23, 59, 59));
+    }
+}
+
+/**
+ * @brief Converts a numeral string to an int.
+ *
+ * Tried in order:
+ *  1. plain integer text accepted by QString::toInt();
+ *  2. a single character present in digitMap;
+ *  3. the positional form [digit] tensUnit [digit], e.g. with tensUnit "十":
+ *     "十" = 10, "十五" = 15, "二十" = 20, "二十五" = 25.
+ *
+ * @return The value, or -1 when the input cannot be interpreted.
+ */
+int TimeExtractor::localeAwareToInt(const QString &input,
+                                    const QMap<QString, int> &digitMap,
+                                    const QString &tensUnit)
+{
+    if (input.isEmpty()) {
+        return -1;
+    }
+
+    // Try direct integer conversion first (handles Arabic numerals)
+    bool ok = false;
+    const int directValue = input.toInt(&ok);
+    if (ok) {
+        return directValue;
+    }
+
+    // Single digit character from digit_map
+    if (input.size() == 1 && digitMap.contains(input)) {
+        return digitMap.value(input);
+    }
+
+    // No digit_map or tens_unit configured - cannot parse locale-specific numbers
+    if (digitMap.isEmpty() || tensUnit.isEmpty()) {
+        return -1;
+    }
+
+    // Positional form: split around the tens unit into an optional tens digit
+    // (left) and an optional ones digit (right).
+    const int tensPos = input.indexOf(tensUnit);
+    if (tensPos < 0) {
+        return -1;
+    }
+
+    const QString tensPart = input.left(tensPos);
+    const QString onesPart = input.mid(tensPos + tensUnit.size());
+
+    // A bare tens unit means "one ten" (e.g. "十五" = 15).
+    int tens = 1;
+    if (!tensPart.isEmpty()) {
+        tens = digitMap.value(tensPart, -1);
+        if (tens < 1 || tens > 9) {
+            return -1;
+        }
+    }
+
+    // An unknown ones digit is an error rather than a silent zero.
+    int ones = 0;
+    if (!onesPart.isEmpty()) {
+        ones = digitMap.value(onesPart, -1);
+        if (ones < 1 || ones > 9) {
+            return -1;
+        }
+    }
+
+    return tens * 10 + ones;
+}
+
+/**
+ * @brief Converts a QVariant holding a map into QMap<QString, int> via QVariant::toInt().
+ */
+QMap<QString, int> TimeExtractor::mapFromVariant(const QVariant &variant)
+{
+    QMap<QString, int> result;
+    const QVariantMap map = variant.toMap();
+    for (auto it = map.constBegin(); it != map.constEnd(); ++it) {
+        result.insert(it.key(), it.value().toInt());
+    }
+    return result;
+}
+
+QString TimeExtractor::name() const
+{
+    return QStringLiteral("time");
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h
new file mode 100644
index 00000000..7fce9c24
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef TIMEEXTRACTOR_H
+#define TIMEEXTRACTOR_H
+
+// NOTE(review): both include targets below were lost in extraction. The first
+// must declare DimensionExtractor, ParsedIntent and TimeConstraint; the second
+// is presumably <QMap> - confirm both.
+#include 
+
+#include 
+
+DFM_SEARCH_BEGIN_NS
+
+class SemanticRuleEngine;
+
+/**
+ * @brief Dimension extractor driven by the "time" rule group.
+ */
+class TimeExtractor : public DimensionExtractor
+{
+public:
+    explicit TimeExtractor(SemanticRuleEngine *engine);
+    ~TimeExtractor() override;
+
+    void extract(const QString &input, ParsedIntent &intent) override;
+    QString name() const override;
+
+private:
+    /**
+     * @brief Fill tc from the named captures of a "custom" time rule match.
+     */
+    void parseCustomTime(const QRegularExpressionMatch &match, const QVariantMap &metadata, TimeConstraint &tc);
+
+    /**
+     * @brief Convert a string to int using locale-aware digit mapping.
+     *
+     * First tries direct integer conversion (Arabic numerals).
+     * Falls back to digit_map lookup and positional tens-unit parsing.
+     * @param input The string to convert
+     * @param digitMap Mapping of locale-specific digit characters to integers (from rule metadata)
+     * @param tensUnit The character representing the tens place (from rule metadata, e.g. "十")
+     * @return The integer value, or -1 if conversion fails
+     */
+    static int localeAwareToInt(const QString &input,
+                                const QMap<QString, int> &digitMap,
+                                const QString &tensUnit);
+
+    /**
+     * @brief Convert a QVariantMap (from JSON) to a QMap<QString, int>.
+     */
+    static QMap<QString, int> mapFromVariant(const QVariant &variant);
+
+    SemanticRuleEngine *m_engine;
+};
+
+DFM_SEARCH_END_NS
+
+#endif // TIMEEXTRACTOR_H
diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp
new file mode 100644
index 00000000..267fd3d9
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "intentparser.h"
+
+#include "semanticruleengine.h"
+#include "extractors/filetypeextractor.h"
+#include "extractors/keywordextractor.h"
+#include "extractors/timeextractor.h"
+
+DFM_SEARCH_BEGIN_NS
+
+IntentParser::IntentParser(SemanticRuleEngine *engine)
+    : m_engine(engine)
+{
+    initDefaultExtractors();
+}
+
+IntentParser::~IntentParser() = default;
+
+void IntentParser::parse(const QString &input, ParsedIntent &intent)
+{
+    // Extractors run in registration order; later ones see earlier results in intent.
+    for (DimensionExtractor *extractor : m_extractors) {
+        extractor->extract(input, intent);
+    }
+}
+
+void IntentParser::addExtractor(std::unique_ptr<DimensionExtractor> extractor)
+{
+    // A null extractor would be dereferenced by parse(); ignore it.
+    if (!extractor) {
+        return;
+    }
+
+    // Take ownership first so the raw pointer never refers to an object
+    // this parser does not own.
+    m_extractorOwners.push_back(std::move(extractor));
+    m_extractors.push_back(m_extractorOwners.back().get());
+}
+
+QStringList IntentParser::extractorNames() const
+{
+    QStringList names;
+    for (DimensionExtractor *e : m_extractors) {
+        names.append(e->name());
+    }
+    return names;
+}
+
+void IntentParser::initDefaultExtractors()
+{
+    // Order matters: keyword MUST be last (depends on consumedSpans)
+    // NOTE(review): the template arguments were lost in extraction; the relative
+    // order of the time and filetype extractors is presumed - confirm.
+    addExtractor(std::make_unique<TimeExtractor>(m_engine));
+    addExtractor(std::make_unique<FileTypeExtractor>(m_engine));
+    addExtractor(std::make_unique<KeywordExtractor>(m_engine));
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.h b/src/dfm-search/dfm-search-lib/semantic/intentparser.h
new file mode 100644
index 00000000..39cab098
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.h
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef INTENTPARSER_H
+#define INTENTPARSER_H
+
+// NOTE(review): the three include targets below were lost in extraction; between
+// them they must declare DimensionExtractor, ParsedIntent, QString and QStringList - confirm.
+#include 
+#include 
+#include 
+
+#include <memory>
+#include <vector>
+
+DFM_SEARCH_BEGIN_NS
+
+class SemanticRuleEngine;
+
+/**
+ * @brief Orchestrates dimension extractors to parse natural language into intent.
+ *
+ * Extractors run in order. KeywordExtractor MUST be last
+ * because it relies on consumedSpans from earlier extractors.
+ */
+class IntentParser
+{
+public:
+    explicit IntentParser(SemanticRuleEngine *engine);
+    ~IntentParser();
+
+    /**
+     * @brief Parse natural language input into a structured intent.
+     * @param input The raw natural language string
+     * @param intent Output: parsed intent
+     */
+    void parse(const QString &input, ParsedIntent &intent);
+
+    /**
+     * @brief Add a custom dimension extractor.
+     * Extractors are called in the order they are added.
+     */
+    void addExtractor(std::unique_ptr<DimensionExtractor> extractor);
+
+    /**
+     * @brief Get the list of extractor names.
+     */
+    QStringList extractorNames() const;
+
+private:
+    void initDefaultExtractors();
+
+    SemanticRuleEngine *m_engine;
+    std::vector<DimensionExtractor *> m_extractors;   // call order, non-owning
+    std::vector<std::unique_ptr<DimensionExtractor>> m_extractorOwners;   // owns the extractors
+};
+
+DFM_SEARCH_END_NS
+
+#endif // INTENTPARSER_H
diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp
new file mode 100644
index 00000000..46a4e656
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp
@@ -0,0 +1,168 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "ruleconfigloader.h"
+#include "semanticruleengine.h"
+
+// NOTE(review): the eight include targets here were lost in extraction; this
+// set is reconstructed from the Qt names the file uses - confirm.
+#include <QDebug>
+#include <QDir>
+#include <QFile>
+#include <QJsonArray>
+#include <QJsonDocument>
+#include <QJsonObject>
+#include <QLocale>
+#include <QStandardPaths>
+
+DFM_SEARCH_BEGIN_NS
+
+namespace {
+// Both values are injected as compile definitions by the build; the fallbacks
+// apply only when a definition is missing.
+#ifdef CMAKE_INSTALL_PREFIX
+constexpr auto kInstallPrefix = CMAKE_INSTALL_PREFIX;
+#else
+constexpr auto kInstallPrefix = "/usr";
+#endif
+
+#ifdef DFM_SEARCH_LIB_NAME
+constexpr auto kLibName = DFM_SEARCH_LIB_NAME;
+#else
+constexpr auto kLibName = "dfm6-search";
+#endif
+}   // namespace
+
+QString RuleConfigLoader::libName()
+{
+    return QLatin1String(kLibName);
+}
+
+// <install prefix>/share/deepin/<lib name>/semantic/rules
+QString RuleConfigLoader::systemRulesDir()
+{
+    return QDir(QDir(QLatin1String(kInstallPrefix))
+                        .absoluteFilePath(QLatin1String("share/deepin/")
+                                          + libName()
+                                          + "/semantic/rules"))
+            .absolutePath();
+}
+
+// <generic config location>/deepin/<lib name>/semantic/rules
+QString RuleConfigLoader::userRulesDir()
+{
+    return QDir(QStandardPaths::writableLocation(QStandardPaths::GenericConfigLocation)
+                + "/deepin/"
+                + libName()
+                + "/semantic/rules")
+            .absolutePath();
+}
+
+QString RuleConfigLoader::currentLocaleName()
+{
+    return QLocale::system().name().simplified();
+}
+
+QStringList RuleConfigLoader::ruleFileNames()
+{
+    return {"noise_rules.json",
+            "time_rules.json",
+            "filetype_rules.json",
+            "keyword_rules.json"};
+}
+
+// Returns the first existing directory of: baseDir/<locale>, baseDir/<language>.
+// Otherwise returns baseDir/<default locale> without checking that it exists.
+QString RuleConfigLoader::resolveLocaleDir(const QString &baseDir)
+{
+    const QString locale = currentLocaleName();
+    const QDir base(baseDir);
+
+    // Try full locale name (e.g., zh_CN)
+    const QString fullLocalePath = base.absoluteFilePath(locale);
+    if (QDir(fullLocalePath).exists()) {
+        return fullLocalePath;
+    }
+
+    // Try language only (e.g., zh from zh_CN)
+    const QString langOnly = locale.split(QLatin1Char('_')).value(0);
+    if (!langOnly.isEmpty() && langOnly != locale) {
+        const QString langOnlyPath = base.absoluteFilePath(langOnly);
+        if (QDir(langOnlyPath).exists()) {
+            return langOnlyPath;
+        }
+    }
+
+    // Fallback to default locale
+    return base.absoluteFilePath(QLatin1String(kDefaultLocale));
+}
+ +QString RuleConfigLoader::resolveRulePath(const QString &filename) +{ + // User-local override with locale + const QString userLocaleDir = resolveLocaleDir(userRulesDir()); + const QString userPath = QDir(userLocaleDir).absoluteFilePath(filename); + if (QFile::exists(userPath) && validateRuleFile(userPath)) { + return userPath; + } + + // System rules with locale + const QString sysLocaleDir = resolveLocaleDir(systemRulesDir()); + const QString sysPath = QDir(sysLocaleDir).absoluteFilePath(filename); + if (QFile::exists(sysPath) && validateRuleFile(sysPath)) { + return sysPath; + } + + return {}; +} + +bool RuleConfigLoader::loadRuleFile(const QString &path, QList &groups) +{ + QFile file(path); + if (!file.open(QIODevice::ReadOnly)) { + qWarning() << "Failed to open rule file:" << path; + return false; + } + + QJsonParseError parseError; + const QJsonDocument doc = QJsonDocument::fromJson(file.readAll(), &parseError); + if (parseError.error != QJsonParseError::NoError) { + qWarning() << "JSON parse error in" << path << ":" << parseError.errorString(); + return false; + } + + const QJsonObject root = doc.object(); + const QJsonArray groupsArray = root.value("groups").toArray(); + + for (const QJsonValue &gv : groupsArray) { + RuleGroup group; + if (!SemanticRuleEngine::parseRuleGroupStatic(gv.toObject(), group)) { + qWarning() << "Invalid rule group in" << path; + continue; + } + groups.append(group); + } + + return !groups.isEmpty(); +} + +bool RuleConfigLoader::validateRuleFile(const QString &path) +{ + QFile file(path); + if (!file.open(QIODevice::ReadOnly)) { + return false; + } + + QJsonParseError parseError; + const QJsonDocument doc = QJsonDocument::fromJson(file.readAll(), &parseError); + if (parseError.error != QJsonParseError::NoError) { + return false; + } + + const QJsonObject root = doc.object(); + return root.contains("groups") && root.value("groups").isArray(); +} + +bool RuleConfigLoader::ensureUserRulesDir() +{ + const QString dir = 
userRulesDir(); + if (!QDir().mkpath(dir)) { + qWarning() << "Failed to create user rules directory:" << dir; + return false; + } + return true; +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h new file mode 100644 index 00000000..34a6469c --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef RULECONFIGLOADER_H +#define RULECONFIGLOADER_H + +#include + +#include + +DFM_SEARCH_BEGIN_NS + +struct RuleGroup; + +/** + * @brief Loads semantic rule configuration files from system/user paths. + * + * Locale resolution: rules are organized under locale subdirectories + * (e.g., rules/zh_CN/, rules/en_US/). The loader resolves the locale + * at runtime using QLocale and falls back through a chain: + * zh_CN -> zh -> zh_CN (default) + * + * Priority: user-local config > system-installed config. + * System path: /usr/share/deepin//semantic/rules/ + * User path: ~/.config/deepin//semantic/rules/ + */ +class RuleConfigLoader +{ +public: + /** + * @brief Get the library name based on Qt version. + */ + static QString libName(); + + /** + * @brief Get the system-installed rules directory. + */ + static QString systemRulesDir(); + + /** + * @brief Get the user-local rules directory for overrides. + */ + static QString userRulesDir(); + + /** + * @brief Get the current locale name (e.g., "zh_CN"). + * Uses QLocale::system().name().simplified(). + */ + static QString currentLocaleName(); + + /** + * @brief Get the list of expected rule file names. + */ + static QStringList ruleFileNames(); + + /** + * @brief Resolve the effective path for a rule file. + * Checks user dir first, then system dir, with locale subdirectory lookup. + * Falls back to zh_CN if the current locale directory is not found. 
+ * @param filename The rule file name (e.g., "time_rules.json") + * @return The resolved absolute path, or empty if not found + */ + static QString resolveRulePath(const QString &filename); + + /** + * @brief Load and parse a rule file into groups. + * @param path Absolute path to the JSON file + * @param groups Output: parsed rule groups + * @return true if file was loaded and valid + */ + static bool loadRuleFile(const QString &path, QList &groups); + + /** + * @brief Validate JSON structure of a rule file. + * @param path Absolute path to the JSON file + * @return true if valid + */ + static bool validateRuleFile(const QString &path); + + /** + * @brief Ensure user rules directory exists. + * @return true on success + */ + static bool ensureUserRulesDir(); + +private: + /** + * @brief Get the locale subdirectory name with fallback chain. + * Tries: full locale (zh_CN) -> language only (zh) -> default (zh_CN) + * @param baseDir The base rules directory + * @return The locale subdirectory path that exists, or baseDir/defaultLocale + */ + static QString resolveLocaleDir(const QString &baseDir); + + static constexpr const char *kDefaultLocale = "zh_CN"; +}; + +DFM_SEARCH_END_NS + +#endif // RULECONFIGLOADER_H diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json new file mode 100644 index 00000000..967da828 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json @@ -0,0 +1,154 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "filetype", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "filetype_word", + "pattern": "word|doc|docx", + "description": "Word documents", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["doc", "docx"], + "fileTypes": ["doc"] + } + }, + { + "id": "filetype_pdf", + "pattern": "pdf", + "description": "PDF documents", + "enabled": true, + "priority": 200, + 
"metadata": { + "extensions": ["pdf"] + } + }, + { + "id": "filetype_excel", + "pattern": "excel|xls|xlsx", + "description": "Excel spreadsheets", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["xls", "xlsx"], + "fileTypes": ["doc"] + } + }, + { + "id": "filetype_ppt", + "pattern": "ppt|pptx", + "description": "PowerPoint presentations", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["ppt", "pptx"] + } + }, + { + "id": "filetype_document_general", + "pattern": "文档|文件|报告|文章|方案|文本|资料|笔记|稿件", + "description": "Generic documents (fallback)", + "enabled": true, + "priority": 100, + "metadata": { + "extensions": ["doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt"], + "fileTypes": ["doc"], + "general": true + } + }, + { + "id": "filetype_spreadsheet_general", + "pattern": "表格|统计表|报表|名单|数据表|明细", + "description": "Generic spreadsheets (fallback)", + "enabled": true, + "priority": 100, + "metadata": { + "extensions": ["xls", "xlsx", "csv", "ods", "et"], + "fileTypes": ["doc"], + "general": true + } + }, + { + "id": "filetype_presentation_general", + "pattern": "幻灯片|演示文稿|汇报|课件|宣讲", + "description": "Generic presentations (fallback)", + "enabled": true, + "priority": 100, + "metadata": { + "extensions": ["ppt", "pptx", "dps", "odp"], + "general": true + } + }, + { + "id": "filetype_image", + "pattern": "图片|照片|截图|壁纸|海报|相片|表情包|图", + "description": "Images", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["jpg", "jpeg", "png", "gif", "bmp", "webp", "svg"], + "fileTypes": ["pic"] + } + }, + { + "id": "filetype_video", + "pattern": "视频|录像|电影|动画|短片|片子", + "description": "Videos", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["mp4", "avi", "mkv", "mov", "flv", "wmv", "webm"], + "fileTypes": ["video"] + } + }, + { + "id": "filetype_audio", + "pattern": "音频|音乐|录音|歌|语音", + "description": "Audio files", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["mp3", "wav", 
"flac", "aac", "ogg", "m4a"], + "fileTypes": ["audio"] + } + }, + { + "id": "filetype_archive", + "pattern": "压缩包|归档|源码包|打包文件|zip|rar", + "description": "Archive files", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["zip", "tar.gz", "tar", "rar", "7z", "bz2"], + "fileTypes": ["archive"] + } + }, + { + "id": "filetype_application", + "pattern": "安装包|软件|应用|脚本|程序", + "description": "Application packages", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["deb", "AppImage", "sh", "py", "bin", "run"], + "fileTypes": ["app"] + } + }, + { + "id": "filetype_design_source", + "pattern": "源文件|设计稿|矢量图|工程文件|psd|fig|sketch", + "description": "Design source files", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["psd", "ai", "fig", "sketch"] + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json new file mode 100644 index 00000000..7fb9ae64 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json @@ -0,0 +1,51 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "keyword", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "keyword_contains", + "pattern": "包含(.+?)(?:的|$)", + "description": "Contains keyword pattern", + "enabled": true, + "priority": 200, + "metadata": { + "capture_group": 1, + "multi_keyword": true, + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + }, + { + "id": "keyword_named", + "pattern": "名为(.+?)(?:的|$)|叫做(.+?)(?:的|$)", + "description": "Named keyword pattern", + "enabled": true, + "priority": 200, + "metadata": { + "capture_group": 1, + "multi_keyword": false, + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + }, + { + "id": 
"keyword_content_has", + "pattern": "内容(?:包含|含有|带有)(.+?)(?:的|$)", + "description": "Content contains keyword pattern", + "enabled": true, + "priority": 200, + "metadata": { + "capture_group": 1, + "multi_keyword": true, + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json new file mode 100644 index 00000000..d247f052 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json @@ -0,0 +1,44 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "noise", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "noise_action", + "pattern": "帮我找到|搜索|查找|找一下|搜一下|查一下|帮我搜|帮我找", + "description": "Search action words to consume", + "enabled": true, + "priority": 100, + "metadata": {} + }, + { + "id": "noise_suffix", + "pattern": "的文件|的图片|的文档|的视频|的音频|的照片|的音乐|的压缩包|的安装包", + "description": "Trailing suffix words to consume", + "enabled": true, + "priority": 90, + "metadata": {} + }, + { + "id": "noise_polite", + "pattern": "请|麻烦|谢谢|帮我", + "description": "Polite words to consume", + "enabled": true, + "priority": 80, + "metadata": {} + }, + { + "id": "noise_generic", + "pattern": "里面|关于|含有|带有|和|与|以及|或者|或", + "description": "Generic filler words to consume", + "enabled": true, + "priority": 70, + "metadata": {} + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json new file mode 100644 index 00000000..318b2ae1 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json @@ -0,0 +1,236 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "time", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "time_today", + "pattern": 
"今天|今日|今日份", + "description": "Today", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "preset": "today" + } + }, + { + "id": "time_yesterday", + "pattern": "昨天上午|昨天下午|昨天晚上|昨天|昨日|昨晚", + "description": "Yesterday", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "preset": "yesterday" + } + }, + { + "id": "time_day_before_yesterday", + "pattern": "前天", + "description": "Day before yesterday", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "preset": "day_before_yesterday" + } + }, + { + "id": "time_this_week", + "pattern": "本周|这周|这个星期|这一个星期", + "description": "This week", + "enabled": true, + "priority": 190, + "metadata": { + "type": "preset", + "preset": "this_week" + } + }, + { + "id": "time_last_week", + "pattern": "上周|上个星期|上星期|上一个星期", + "description": "Last week", + "enabled": true, + "priority": 190, + "metadata": { + "type": "preset", + "preset": "last_week" + } + }, + { + "id": "time_this_month", + "pattern": "本月|这个月|当月", + "description": "This month", + "enabled": true, + "priority": 180, + "metadata": { + "type": "preset", + "preset": "this_month" + } + }, + { + "id": "time_last_month", + "pattern": "上个月|上月", + "description": "Last month", + "enabled": true, + "priority": 180, + "metadata": { + "type": "preset", + "preset": "last_month" + } + }, + { + "id": "time_this_year", + "pattern": "今年|本年|这年", + "description": "This year", + "enabled": true, + "priority": 170, + "metadata": { + "type": "preset", + "preset": "this_year" + } + }, + { + "id": "time_last_year", + "pattern": "去年|上一年", + "description": "Last year", + "enabled": true, + "priority": 170, + "metadata": { + "type": "preset", + "preset": "last_year" + } + }, + { + "id": "time_exact_year", + "pattern": "(?\\d{2,4})年", + "description": "Exact year (e.g. 
2025年, 25年)", + "enabled": true, + "priority": 120, + "metadata": { + "type": "custom", + "format": "year", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_month_current_year", + "pattern": "(?\\d{1,2})月份?", + "description": "Month this year (e.g. 12月, 5月份)", + "enabled": true, + "priority": 120, + "metadata": { + "type": "custom", + "format": "month", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_year_month", + "pattern": "(?\\d{2,4})[年\\./\\-](?\\d{1,2})月?", + "description": "Exact year-month (e.g. 2025年12月, 2025-12)", + "enabled": true, + "priority": 140, + "metadata": { + "type": "custom", + "format": "year_month", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_date_current_year", + "pattern": "(?\\d{1,2})月(?\\d{1,2})[日号]", + "description": "Exact date this year (e.g. 12月5日, 3月8号)", + "enabled": true, + "priority": 140, + "metadata": { + "type": "custom", + "format": "date", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_full_date", + "pattern": "(?\\d{2,4})[年\\./\\-](?\\d{1,2})[月\\./\\-](?\\d{1,2})[日号]?", + "description": "Exact full date (e.g. 
2025年12月5日, 2025-12-05)", + "enabled": true, + "priority": 160, + "metadata": { + "type": "custom", + "format": "full_date", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp new file mode 100644 index 00000000..44a1f477 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -0,0 +1,187 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "semanticquerybuilder.h" + +#include +#include +#include +#include +#include + +DFM_SEARCH_BEGIN_NS + +SemanticQueryBuilder::SemanticQueryBuilder() = default; +SemanticQueryBuilder::~SemanticQueryBuilder() = default; + +SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) +{ + SemanticSearchPlan plan; + + // Base options shared across all search paths + SearchOptions baseOpts = buildBaseOptions(intent.timeConstraint); + baseOpts.setSearchMethod(SearchMethod::Indexed); + + // --- File name search (always enabled) --- + { + SearchOptions opts = baseOpts; + FileNameOptionsAPI fnameApi(opts); + fnameApi.setPinyinEnabled(true); + fnameApi.setPinyinAcronymEnabled(true); + + if (!intent.fileExtensions.isEmpty()) { + fnameApi.setFileExtensions(intent.fileExtensions); + } + + if (intent.keywords.size() == 1) { + plan.fileNameQuery = SearchFactory::createQuery(intent.keywords.first()); + } else if (intent.keywords.size() > 1) { + plan.fileNameQuery = SearchFactory::createQuery(intent.keywords, SearchQuery::Type::Boolean); + } else { + // No keywords: search all files (use wildcard to match everything) + plan.fileNameQuery = SearchFactory::createQuery(QStringLiteral("*")); + } + + plan.fileNameOptions = opts; + } + 
+ // --- Content search (when keywords available) --- + { + const bool hasKeywords = !intent.keywords.isEmpty(); + bool contentEnabled = hasKeywords; + + // Check if content index is available + if (contentEnabled && !Global::isContentIndexAvailable()) { + contentEnabled = false; + } + + if (contentEnabled) { + // Check minimum keyword length + const int minLen = Global::kMinContentSearchKeywordLength; + bool hasValidKeyword = false; + for (const QString &kw : intent.keywords) { + if (kw.length() >= minLen) { + hasValidKeyword = true; + break; + } + } + if (!hasValidKeyword) { + contentEnabled = false; + } + } + + if (contentEnabled) { + SearchOptions opts = baseOpts; + ContentOptionsAPI contentApi(opts); + + // Enable filename-content mixed AND search + contentApi.setFilenameContentMixedAndSearchEnabled(true); + + if (intent.keywords.size() == 1) { + plan.contentQuery = SearchFactory::createQuery(intent.keywords.first()); + } else if (intent.keywords.size() > 1) { + plan.contentQuery = SearchFactory::createQuery(intent.keywords, SearchQuery::Type::Boolean); + } + + plan.contentOptions = opts; + } + } + + // --- OCR search (when keywords available) --- + { + const bool hasKeywords = !intent.keywords.isEmpty(); + bool ocrEnabled = hasKeywords; + + if (ocrEnabled && !Global::isOcrTextIndexAvailable()) { + ocrEnabled = false; + } + + if (ocrEnabled) { + SearchOptions opts = baseOpts; + OcrTextOptionsAPI ocrApi(opts); + + // Enable filename-OCR content mixed AND search + ocrApi.setFilenameOcrContentMixedAndSearchEnabled(true); + + if (intent.keywords.size() == 1) { + plan.ocrQuery = SearchFactory::createQuery(intent.keywords.first()); + } else if (intent.keywords.size() > 1) { + plan.ocrQuery = SearchFactory::createQuery(intent.keywords, SearchQuery::Type::Boolean); + } + + plan.ocrOptions = opts; + } + } + + return plan; +} + +TimeRangeFilter SemanticQueryBuilder::buildTimeRangeFilter(const TimeConstraint &tc) const +{ + TimeRangeFilter filter; + + if (!tc.isValid()) 
{ + return filter; + } + + switch (tc.kind) { + case TimeConstraintKind::Preset: + switch (tc.preset) { + case TimePreset::Today: + filter.setToday(); + break; + case TimePreset::Yesterday: + filter.setYesterday(); + break; + case TimePreset::DayBeforeYesterday: { + const QDate today = QDate::currentDate(); + const QDate dayBefore = today.addDays(-2); + filter.setRange(QDateTime(dayBefore, QTime(0, 0, 0)), + QDateTime(dayBefore, QTime(23, 59, 59))); + break; + } + case TimePreset::ThisWeek: + filter.setThisWeek(); + break; + case TimePreset::LastWeek: + filter.setLastWeek(); + break; + case TimePreset::ThisMonth: + filter.setThisMonth(); + break; + case TimePreset::LastMonth: + filter.setLastMonth(); + break; + case TimePreset::ThisYear: + filter.setThisYear(); + break; + case TimePreset::LastYear: + filter.setLastYear(); + break; + } + break; + case TimeConstraintKind::Relative: + filter.setLast(tc.relativeValue, tc.relativeUnit); + break; + case TimeConstraintKind::Custom: + filter.setRange(tc.customStart, tc.customEnd); + break; + case TimeConstraintKind::None: + break; + } + + return filter; +} + +SearchOptions SemanticQueryBuilder::buildBaseOptions(const TimeConstraint &tc) const +{ + SearchOptions opts; + const TimeRangeFilter timeFilter = buildTimeRangeFilter(tc); + if (timeFilter.isValid()) { + opts.setTimeRangeFilter(timeFilter); + } + return opts; +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h new file mode 100644 index 00000000..2f8c2136 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTICQUERYBUILDER_H +#define SEMANTICQUERYBUILDER_H + +#include +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +/** + * @brief Search plan for all three search paths. + */ +struct SemanticSearchPlan { + SearchQuery fileNameQuery; + SearchOptions fileNameOptions; + std::optional contentQuery; + std::optional contentOptions; + std::optional ocrQuery; + std::optional ocrOptions; +}; + +/** + * @brief Converts a ParsedIntent into concrete SearchQuery + SearchOptions for each path. + */ +class SemanticQueryBuilder +{ +public: + SemanticQueryBuilder(); + ~SemanticQueryBuilder(); + + /** + * @brief Build a search plan from parsed intent. + * @param intent The parsed intent + * @return A search plan with queries and options for each search path + */ + SemanticSearchPlan build(const ParsedIntent &intent); + +private: + TimeRangeFilter buildTimeRangeFilter(const TimeConstraint &tc) const; + SearchOptions buildBaseOptions(const TimeConstraint &tc) const; +}; + +DFM_SEARCH_END_NS + +#endif // SEMANTICQUERYBUILDER_H diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp new file mode 100644 index 00000000..1db5fa30 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp @@ -0,0 +1,285 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "semanticruleengine.h" +#include "ruleconfigloader.h" + +#include +#include +#include +#include + +DFM_SEARCH_BEGIN_NS + +SemanticRuleEngine::SemanticRuleEngine(QObject *parent) + : QObject(parent) + , m_watcher(new QFileSystemWatcher(this)) + , m_reloadTimer(new QTimer(this)) +{ + m_reloadTimer->setSingleShot(true); + m_reloadTimer->setInterval(100); + + QObject::connect(m_reloadTimer, &QTimer::timeout, this, [this]() { + loadRules(); + Q_EMIT rulesReloaded(); + }); + + QObject::connect(m_watcher, &QFileSystemWatcher::fileChanged, + this, [this](const QString &) { + m_reloadTimer->start(); + }); +} + +SemanticRuleEngine::~SemanticRuleEngine() = default; + +bool SemanticRuleEngine::loadRules() +{ + QMap newGroups; + QStringList watchedFiles; + + for (const QString &filename : RuleConfigLoader::ruleFileNames()) { + const QString path = RuleConfigLoader::resolveRulePath(filename); + if (path.isEmpty()) { + qWarning() << "Rule file not found:" << filename; + continue; + } + + QList loaded; + if (!RuleConfigLoader::loadRuleFile(path, loaded)) { + qWarning() << "Failed to load rule file:" << path; + continue; + } + + for (RuleGroup &group : loaded) { + if (newGroups.contains(group.name)) { + // Merge: later rules override by ID + for (const Rule &rule : group.rules) { + auto &existingRules = newGroups[group.name].rules; + bool replaced = false; + for (int i = 0; i < existingRules.size(); ++i) { + if (existingRules[i].id == rule.id) { + existingRules[i] = rule; + replaced = true; + break; + } + } + if (!replaced) { + existingRules.append(rule); + } + } + } else { + newGroups.insert(group.name, std::move(group)); + } + + m_ruleFilePaths.insert(group.name, path); + if (!watchedFiles.contains(path)) { + watchedFiles.append(path); + } + } + } + + if (newGroups.isEmpty()) { + qWarning() << "No rule files loaded, keeping cached rules"; + return !m_groups.isEmpty(); + } + + // Cache valid rules for rollback + 
m_cachedGroups = m_groups.isEmpty() ? newGroups : m_groups; + m_groups = newGroups; + + // Update file watcher + if (!m_watcher->files().isEmpty()) { + m_watcher->removePaths(m_watcher->files()); + } + for (const QString &f : watchedFiles) { + m_watcher->addPath(f); + } + + return true; +} + +bool SemanticRuleEngine::loadRuleFile(const QString &path) +{ + QList loaded; + if (!RuleConfigLoader::loadRuleFile(path, loaded)) { + qWarning() << "Failed to load rule file:" << path; + return false; + } + + for (RuleGroup &group : loaded) { + if (m_groups.contains(group.name)) { + // Merge: later rules override by ID + for (const Rule &rule : group.rules) { + auto &existingRules = m_groups[group.name].rules; + bool replaced = false; + for (int i = 0; i < existingRules.size(); ++i) { + if (existingRules[i].id == rule.id) { + existingRules[i] = rule; + replaced = true; + break; + } + } + if (!replaced) { + existingRules.append(rule); + } + } + } else { + m_groups.insert(group.name, std::move(group)); + } + + m_ruleFilePaths.insert(group.name, path); + } + + return true; +} + +bool SemanticRuleEngine::match(const QString &group, const QString &input, QRegularExpressionMatch &outMatch, + QString *outRuleId) +{ + if (!m_groups.contains(group)) { + return false; + } + + const QList &rules = m_groups.value(group).rules; + QList sorted = rules; + std::stable_sort(sorted.begin(), sorted.end(), + [](const Rule &a, const Rule &b) { return a.priority > b.priority; }); + + for (const Rule &rule : sorted) { + if (!rule.enabled || !rule.regex.isValid()) { + continue; + } + QRegularExpressionMatch m = rule.regex.match(input); + if (m.hasMatch()) { + outMatch = m; + if (outRuleId) { + *outRuleId = rule.id; + } + return true; + } + } + + return false; +} + +QList SemanticRuleEngine::matchAll(const QString &group, const QString &input, + QStringList *outRuleIds) +{ + QList results; + + if (!m_groups.contains(group)) { + return results; + } + + const QList &rules = m_groups.value(group).rules; 
+ QList sorted = rules; + std::stable_sort(sorted.begin(), sorted.end(), + [](const Rule &a, const Rule &b) { return a.priority > b.priority; }); + + for (const Rule &rule : sorted) { + if (!rule.enabled || !rule.regex.isValid()) { + continue; + } + + // Use globalMatch to find ALL occurrences of this rule's pattern. + // This is important for noise rules (e.g., "和" appearing multiple times). + auto it = rule.regex.globalMatch(input); + while (it.hasNext()) { + QRegularExpressionMatch m = it.next(); + if (m.hasMatch()) { + results.append(m); + if (outRuleIds) { + outRuleIds->append(rule.id); + } + } + } + } + + return results; +} + +QVariantMap SemanticRuleEngine::ruleMetadata(const QString &group, const QString &ruleId) const +{ + if (!m_groups.contains(group)) { + return {}; + } + + for (const Rule &rule : m_groups.value(group).rules) { + if (rule.id == ruleId) { + return rule.metadata; + } + } + return {}; +} + +bool SemanticRuleEngine::hasGroup(const QString &group) const +{ + return m_groups.contains(group); +} + +QStringList SemanticRuleEngine::ruleIds(const QString &group) const +{ + const auto it = m_groups.constFind(group); + if (it == m_groups.constEnd()) { + return {}; + } + + QStringList ids; + for (const Rule &rule : it->rules) { + ids.append(rule.id); + } + return ids; +} + +QStringList SemanticRuleEngine::groupNames() const +{ + return m_groups.keys(); +} + +bool SemanticRuleEngine::parseRuleGroupStatic(const QJsonObject &groupObj, RuleGroup &outGroup) +{ + if (!groupObj.contains("name") || !groupObj.contains("rules")) { + return false; + } + + outGroup.name = groupObj.value("name").toString(); + outGroup.version = groupObj.value("version").toString("1.0.0"); + outGroup.locale = groupObj.value("locale").toString(); + + const QJsonArray rulesArray = groupObj.value("rules").toArray(); + for (const QJsonValue &rv : rulesArray) { + const QJsonObject ruleObj = rv.toObject(); + + Rule rule; + rule.id = ruleObj.value("id").toString(); + rule.pattern = 
ruleObj.value("pattern").toString(); + rule.description = ruleObj.value("description").toString(); + rule.enabled = ruleObj.value("enabled").toBool(true); + rule.priority = ruleObj.value("priority").toInt(0); + rule.metadata = ruleObj.value("metadata").toVariant().toMap(); + + if (rule.pattern.isEmpty() || rule.id.isEmpty()) { + continue; + } + + rule.regex.setPattern(rule.pattern); + rule.regex.setPatternOptions(QRegularExpression::CaseInsensitiveOption); + if (!rule.regex.isValid()) { + qWarning() << "Invalid regex for rule" << rule.id << ":" << rule.regex.errorString(); + continue; + } + + outGroup.rules.append(rule); + } + + return !outGroup.rules.isEmpty(); +} + +void SemanticRuleEngine::onRuleFilesChanged(const QStringList &files) +{ + Q_UNUSED(files); + m_reloadTimer->start(); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h new file mode 100644 index 00000000..ecf95a3e --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTICRULEENGINE_H +#define SEMANTICRULEENGINE_H + +#include +#include +#include +#include +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +struct Rule { + QString id; + QString pattern; + QString description; + bool enabled = true; + int priority = 0; + QVariantMap metadata; + QRegularExpression regex; +}; + +struct RuleGroup { + QString name; + QString version; + QString locale; + QList rules; +}; + +/** + * @brief Rule engine that loads regex rules from JSON config files. + * + * Provides match/matchAll operations with priority-based ordering. + * Supports hot-reload via QFileSystemWatcher. 
+ */ +class SemanticRuleEngine : public QObject +{ + Q_OBJECT + +public: + explicit SemanticRuleEngine(QObject *parent = nullptr); + ~SemanticRuleEngine() override; + + /** + * @brief Load rules from all rule files in the config directory. + * @return true if at least one valid rule file was loaded. + */ + bool loadRules(); + + /** + * @brief Load rules from a specific rule file. + * Useful for testing or loading custom rule files. + * Merges with any previously loaded rules by group name. + * @param path Absolute path to a JSON rule file + * @return true if the file was loaded successfully + */ + bool loadRuleFile(const QString &path); + + /** + * @brief Find the highest-priority matching rule in a group. + * @param group The rule group name + * @param input The input text to match against + * @param outMatch Output: the regex match result + * @param outRuleId Output: the matched rule's ID (optional) + * @return true if a match was found + */ + bool match(const QString &group, const QString &input, QRegularExpressionMatch &outMatch, + QString *outRuleId = nullptr); + + /** + * @brief Find all matching rules in a group (priority order). + * @param group The rule group name + * @param input The input text to match against + * @param outRuleIds Output: matched rule IDs (optional, parallel to result list) + * @return List of all matches + */ + QList matchAll(const QString &group, const QString &input, + QStringList *outRuleIds = nullptr); + + /** + * @brief Get a rule's metadata by group and rule ID. + */ + QVariantMap ruleMetadata(const QString &group, const QString &ruleId) const; + + /** + * @brief Get all rule IDs in a group. + */ + QStringList ruleIds(const QString &group) const; + + /** + * @brief Check if a rule group exists and has enabled rules. + */ + bool hasGroup(const QString &group) const; + + /** + * @brief Get the list of loaded rule group names. + */ + QStringList groupNames() const; + + /** + * @brief Static helper to parse a rule group from JSON. 
+ */ + static bool parseRuleGroupStatic(const QJsonObject &groupObj, RuleGroup &outGroup); + +Q_SIGNALS: + void rulesReloaded(); + +private Q_SLOTS: + void onRuleFilesChanged(const QStringList &files); + +private: + bool parseRuleGroup(const QJsonObject &groupObj, RuleGroup &outGroup); + void startWatching(); + + QMap m_groups; + QMap m_cachedGroups; // last valid rules for rollback + QMap m_ruleFilePaths; // group name -> resolved file path + QFileSystemWatcher *m_watcher = nullptr; + QTimer *m_reloadTimer = nullptr; +}; + +DFM_SEARCH_END_NS + +#endif // SEMANTICRULEENGINE_H diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp new file mode 100644 index 00000000..72518288 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -0,0 +1,296 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include "semanticsearcher_p.h" +#include "semanticquerybuilder.h" +#include "intentparser.h" +#include "semanticruleengine.h" + +#include + +#include +#include +#include + +DFM_SEARCH_BEGIN_NS + +SemanticSearcherData::SemanticSearcherData(SemanticSearcher *q_ptr) + : q(q_ptr) + , ruleEngine(new SemanticRuleEngine(q)) + , intentParser(new IntentParser(ruleEngine)) + , queryBuilder(new SemanticQueryBuilder()) + , timeoutTimer(new QTimer(q)) +{ + timeoutTimer->setSingleShot(true); + timeoutTimer->setInterval(timeoutSeconds * 1000); + + QObject::connect(timeoutTimer, &QTimer::timeout, q, [this]() { + qWarning() << "Semantic search timed out after" << timeoutSeconds << "seconds"; + doCancel(); + }); + + // Load rules + if (!ruleEngine->loadRules()) { + qWarning() << "Failed to load semantic rules"; + } +} + +SemanticSearcherData::~SemanticSearcherData() +{ + doCancel(); +} + +void SemanticSearcherData::doSearch(const QString &naturalLanguage) +{ + if (naturalLanguage.trimmed().isEmpty()) { 
+ Q_EMIT q->errorOccurred(SearchError(SearchErrorCode::InvalidQuery)); + return; + } + + cancelled.store(false); + allResults.clear(); + seenPaths.clear(); + status.store(SearchStatus::Searching); + Q_EMIT q->statusChanged(SearchStatus::Searching); + Q_EMIT q->searchStarted(); + + // Step 1: Parse natural language into intent + ParsedIntent intent; + intentParser->parse(naturalLanguage, intent); + + // Step 2: Build search plan + const SemanticSearchPlan plan = queryBuilder->build(intent); + + // Step 3: Create and launch search engines in parallel + auto onResultsFound = [this](const SearchResultList &results) { + SearchResultList newResults; + for (const SearchResult &r : results) { + if (!seenPaths.contains(r.path())) { + seenPaths.insert(r.path()); + newResults.append(r); + } + } + if (!newResults.isEmpty()) { + allResults.append(newResults); + Q_EMIT q->resultsFound(newResults); + } + }; + + auto onFinished = [this](const SearchResultList &) { + if (pendingFinishCount.fetch_sub(1) == 1) { + // All engines finished + timeoutTimer->stop(); + if (cancelled.load()) { + status.store(SearchStatus::Cancelled); + Q_EMIT q->statusChanged(SearchStatus::Cancelled); + Q_EMIT q->searchCancelled(); + } else { + status.store(SearchStatus::Finished); + Q_EMIT q->statusChanged(SearchStatus::Finished); + Q_EMIT q->searchFinished(allResults); + } + } + }; + + auto onError = [this](const SearchError &error) { + qWarning() << "Search error:" << error.message(); + // Don't propagate individual engine errors to caller + // The other engines may still produce valid results + }; + + // Count how many engines we'll launch + pendingFinishCount.store(0); + + // Clean up any previous search engines (they have parent q, so Qt deletes them) + if (fileNameEngine) { + fileNameEngine->deleteLater(); + fileNameEngine = nullptr; + } + if (contentEngine) { + contentEngine->deleteLater(); + contentEngine = nullptr; + } + if (ocrEngine) { + ocrEngine->deleteLater(); + ocrEngine = nullptr; + } + + 
// File name search (always) + if (Global::isFileNameIndexReadyForSearch()) { + fileNameEngine = SearchEngine::create(SearchType::FileName, q); + fileNameEngine->setSearchOptions(plan.fileNameOptions); + + QObject::connect(fileNameEngine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(fileNameEngine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(fileNameEngine, &SearchEngine::errorOccurred, q, onError); + + pendingFinishCount.fetch_add(1); + fileNameEngine->search(plan.fileNameQuery); + } + + // Content search + if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { + contentEngine = SearchEngine::create(SearchType::Content, q); + contentEngine->setSearchOptions(*plan.contentOptions); + + QObject::connect(contentEngine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(contentEngine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(contentEngine, &SearchEngine::errorOccurred, q, onError); + + pendingFinishCount.fetch_add(1); + contentEngine->search(*plan.contentQuery); + } + + // OCR search + if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { + ocrEngine = SearchEngine::create(SearchType::Ocr, q); + ocrEngine->setSearchOptions(*plan.ocrOptions); + + QObject::connect(ocrEngine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(ocrEngine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(ocrEngine, &SearchEngine::errorOccurred, q, onError); + + pendingFinishCount.fetch_add(1); + ocrEngine->search(*plan.ocrQuery); + } + + // If no engines were launched (e.g., no indexes available) + if (pendingFinishCount.load() == 0) { + timeoutTimer->stop(); + status.store(SearchStatus::Finished); + Q_EMIT q->statusChanged(SearchStatus::Finished); + Q_EMIT q->searchFinished({}); + } else { + // Start timeout timer + if (timeoutSeconds > 0) { + timeoutTimer->start(); + } + } +} + +void SemanticSearcherData::doCancel() +{ + cancelled.store(true); 
+ timeoutTimer->stop(); + + if (fileNameEngine) { + fileNameEngine->cancel(); + } + if (contentEngine) { + contentEngine->cancel(); + } + if (ocrEngine) { + ocrEngine->cancel(); + } +} + +// --- SemanticSearcher public API --- + +SemanticSearcher::SemanticSearcher(QObject *parent) + : QObject(parent) + , d_ptr(new SemanticSearcherData(this)) +{ +} + +SemanticSearcher::~SemanticSearcher() = default; + +SearchStatus SemanticSearcher::status() const +{ + return d_ptr->status.load(); +} + +void SemanticSearcher::setSearchTimeout(int seconds) +{ + d_ptr->timeoutSeconds = seconds; + d_ptr->timeoutTimer->setInterval(seconds * 1000); +} + +int SemanticSearcher::searchTimeout() const +{ + return d_ptr->timeoutSeconds; +} + +void SemanticSearcher::search(const QString &naturalLanguage) +{ + if (d_ptr->status.load() == SearchStatus::Searching) { + qWarning() << "Search already in progress"; + return; + } + + d_ptr->doSearch(naturalLanguage); +} + +void SemanticSearcher::cancel() +{ + d_ptr->doCancel(); +} + +SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage) +{ + if (d_ptr->status.load() == SearchStatus::Searching) { + qWarning() << "Search already in progress"; + return Dtk::Core::DUnexpected(SearchError(SearchErrorCode::InvalidQuery)); + } + + if (naturalLanguage.trimmed().isEmpty()) { + return Dtk::Core::DUnexpected(SearchError(SearchErrorCode::InvalidQuery)); + } + + SearchResultList results; + bool hasError = false; + SearchError lastError; + bool cancelled = false; + bool done = false; + + QEventLoop eventLoop; + + // Use a shared guard flag so late-arriving signals after eventLoop exits are harmless. + // The internal doSearch timeout mechanism is relied upon for actual cancellation. 
+ QObject::connect(this, &SemanticSearcher::searchFinished, this, + [&](const SearchResultList &r) { + if (!done) { + results = r; + done = true; + eventLoop.quit(); + } + }); + + QObject::connect(this, &SemanticSearcher::searchCancelled, this, + [&]() { + if (!done) { + cancelled = true; + done = true; + eventLoop.quit(); + } + }); + + QObject::connect(this, &SemanticSearcher::errorOccurred, this, + [&](const SearchError &error) { + if (!done) { + hasError = true; + lastError = error; + done = true; + eventLoop.quit(); + } + }); + + // Start the async search (uses internal timeout mechanism) + d_ptr->doSearch(naturalLanguage); + + // Block until completion, cancellation, or error + eventLoop.exec(); + + if (cancelled) { + return results; + } + + if (hasError) { + return Dtk::Core::DUnexpected(lastError); + } + + return results; +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h new file mode 100644 index 00000000..75fd4197 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTICSEARCHER_P_H +#define SEMANTICSEARCHER_P_H + +#include +#include + +#include +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; +class IntentParser; +class SemanticQueryBuilder; +class SemanticSearchPlan; + +class SemanticSearcherData +{ +public: + explicit SemanticSearcherData(SemanticSearcher *q); + ~SemanticSearcherData(); + + void doSearch(const QString &naturalLanguage); + void doCancel(); + + SemanticSearcher *q = nullptr; + + // State + std::atomic status{SearchStatus::Ready}; + std::atomic cancelled{false}; + int timeoutSeconds = 60; + + // Core components (owned) + SemanticRuleEngine *ruleEngine = nullptr; + IntentParser *intentParser = nullptr; + SemanticQueryBuilder *queryBuilder = nullptr; + + // Sub-engines (owned per search) + SearchEngine *fileNameEngine = nullptr; + SearchEngine *contentEngine = nullptr; + SearchEngine *ocrEngine = nullptr; + std::atomic pendingFinishCount{0}; + + // Result collection + SearchResultList allResults; + QSet seenPaths; + + // Timeout + QTimer *timeoutTimer = nullptr; +}; + +DFM_SEARCH_END_NS + +#endif // SEMANTICSEARCHER_P_H From b48ad3e1934adbc1ba5962a8bca9c94276f754c7 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 14:20:37 +0800 Subject: [PATCH 03/36] feat: add relative time support for Chinese search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added new relative time parsing rules in Chinese NLP with 4 categories: just now (2h), recent days (3d), past few days (3-7d), and a while ago (30+ days) 2. Implemented test cases for relative time queries with detailed time range validation 3. Added support for various synonyms for each time category 4. Implemented priority handling when relative and preset time rules conflict 5. Extended TimeExtractor to properly handle relative time metadata 6. 
Updated query builder to use custom time ranges for relative time constraints Log: Added support for "just now", "recent days", "past few days" and "a while ago" time ranges in Chinese file search Influence: 1. Test Chinese queries with "刚刚", "最近", "前几天", "之前" and synonyms 2. Verify time ranges match expected periods (2h, 3d, 3-7d, 30d+) 3. Verify priority when combined with preset times like "今天之前" 4. Check file type filtering still works with relative time queries 5. Test edge cases like exact minute boundaries for "just now" feat: 增加中文搜索的相对时间支持 1. 在中文NLP中新增4类相对时间解析规则:刚刚(2小时内)、最近(3天内)、前几 天(3-7天前)、之前(30天前) 2. 实现了相对时间查询的测试用例,包含详细的时间范围验证 3. 为每个时间类别添加了多种同义词支持 4. 实现了当相对时间和预设时间规则冲突时的优先级处理 5. 扩展TimeExtractor以正确处理相对时间元数据 6. 更新查询构建器以使用自定义时间范围处理相对时间约束 Log: 在中文文件搜索中新增对"刚刚"、"最近"、"前几天"和"之前"时间范围的 支持 Influence: 1. 测试包含"刚刚"、"最近"、"前几天"、"之前"及其同义词的中文查询 2. 验证时间范围是否符合预期(2小时、3天、3-7天、30天以上) 3. 验证与预设时间如"今天之前"组合时的优先级 4. 检查文件类型过滤在相对时间查询中是否仍然有效 5. 测试边缘情况,如"刚刚"查询的精确分钟边界 --- .../dfm-search-tests/tst_chinese_nlp.cpp | 116 ++++++++++++++++++ .../semantic/extractors/timeextractor.cpp | 20 +++ .../semantic/extractors/timeextractor.h | 1 + .../semantic/rules/zh_CN/time_rules.json | 52 ++++++++ .../semantic/semanticquerybuilder.cpp | 2 +- 5 files changed, 190 insertions(+), 1 deletion(-) diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp index a3b79aa5..b034d101 100644 --- a/autotests/dfm-search-tests/tst_chinese_nlp.cpp +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -95,6 +96,17 @@ private Q_SLOTS: void combined_monthAndType(); void combined_yearAndType(); + // Relative time tests + void timeRelative_justNow(); + void timeRelative_justNow_synonyms(); + void timeRelative_recentDays(); + void timeRelative_recentDays_synonyms(); + void timeRelative_pastFewDays(); + void timeRelative_pastFewDays_synonyms(); + void timeRelative_aWhileAgo(); + void 
timeRelative_aWhileAgo_synonyms(); + void timeRelative_priority_vs_preset(); + // Keyword tests void keyword_contains_single(); void keyword_contains_multi(); @@ -1027,6 +1039,110 @@ void tst_ChineseNLP::combined_yearAndType() QVERIFY(intent.fileExtensions.contains("avi")); } +// ===== Relative Time Tests ===== + +void tst_ChineseNLP::timeRelative_justNow() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("刚刚的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End should be very close to NOW (within 2 seconds) + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime())); + QVERIFY2(endDelta < 2, "Relative end should be close to NOW"); + // Start should be ~2 hours ago + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime().addSecs(-7200))); + QVERIFY2(startDelta < 2, "Relative start should be ~2h ago"); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + +void tst_ChineseNLP::timeRelative_justNow_synonyms() +{ + const QStringList inputs = { QStringLiteral("刚才"), QStringLiteral("刚"), + QStringLiteral("这会儿") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_recentDays() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End = NOW, Start = NOW - 3 days + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime())); + QVERIFY2(endDelta < 2, "Recent days end should be NOW"); + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime().addSecs(-259200))); + QVERIFY2(startDelta < 2, "Recent days start should be ~3 days ago"); +} + +void 
tst_ChineseNLP::timeRelative_recentDays_synonyms() +{ + const QStringList inputs = { QStringLiteral("这几天"), QStringLiteral("近期"), + QStringLiteral("这阵子") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_pastFewDays() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("前几天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End = NOW - 3 days, Start = NOW - 7 days + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime().addSecs(-259200))); + QVERIFY2(endDelta < 2, "Past few days end should be ~3 days ago"); + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime().addSecs(-604800))); + QVERIFY2(startDelta < 2, "Past few days start should be ~7 days ago"); +} + +void tst_ChineseNLP::timeRelative_pastFewDays_synonyms() +{ + const QStringList inputs = { QStringLiteral("之前几天"), QStringLiteral("那些天") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_aWhileAgo() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("之前的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End = NOW - 30 days + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime().addSecs(-2592000))); + QVERIFY2(endDelta < 2, "A while ago end should be ~30 days ago"); + // Start should be epoch + QCOMPARE(intent.timeConstraint.customStart, QDateTime::fromMSecsSinceEpoch(0)); +} + +void tst_ChineseNLP::timeRelative_aWhileAgo_synonyms() +{ + const QStringList inputs = { QStringLiteral("早些时候"), QStringLiteral("以前") }; + for 
(const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_priority_vs_preset() +{ + // When both preset and relative could match, preset should win (higher priority) + // "今天之前" — "今天" matches time_today (priority 200), "之前" matches time_a_while_ago (priority 80) + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天之前的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); +} + QObject *create_tst_ChineseNLP() { return new tst_ChineseNLP(); diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp index c4f77443..5f04389f 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp @@ -7,6 +7,7 @@ #include "../semanticruleengine.h" #include +#include #include DFM_SEARCH_BEGIN_NS @@ -54,6 +55,8 @@ void TimeExtractor::extract(const QString &input, ParsedIntent &intent) } } else if (typeStr == "custom") { parseCustomTime(match, metadata, tc); + } else if (typeStr == "relative") { + parseRelativeTime(metadata, tc); } if (tc.isValid()) { @@ -147,6 +150,23 @@ void TimeExtractor::parseCustomTime(const QRegularExpressionMatch &match, } } +void TimeExtractor::parseRelativeTime(const QVariantMap &metadata, TimeConstraint &tc) +{ + const QDateTime now = QDateTime::currentDateTime(); + const int agoEndSecs = metadata.value("ago_end_seconds", 0).toInt(); + const int agoStartSecs = metadata.value("ago_start_seconds", 0).toInt(); + + tc.kind = TimeConstraintKind::Relative; + tc.customEnd = now.addSecs(-agoEndSecs); + + if (agoStartSecs < 0) { + // Sentinel: "from epoch" + tc.customStart = QDateTime::fromMSecsSinceEpoch(0); + } else { + 
tc.customStart = now.addSecs(-agoStartSecs); + } +} + int TimeExtractor::localeAwareToInt(const QString &input, const QMap &digitMap, const QString &tensUnit) diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h index 7fce9c24..66774f6b 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h @@ -24,6 +24,7 @@ class TimeExtractor : public DimensionExtractor private: void parseCustomTime(const QRegularExpressionMatch &match, const QVariantMap &metadata, TimeConstraint &tc); + void parseRelativeTime(const QVariantMap &metadata, TimeConstraint &tc); /** * @brief Convert a string to int using locale-aware digit mapping. diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json index 318b2ae1..bd125279 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json @@ -229,6 +229,58 @@ }, "tens_unit": "十" } + }, + { + "id": "time_just_now", + "pattern": "这会儿|刚才|刚刚|刚", + "description": "Just now (last 2 hours)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "just_now", + "ago_start_seconds": 7200, + "ago_end_seconds": 0 + } + }, + { + "id": "time_recent_days", + "pattern": "这阵子|近期|最近|这几天", + "description": "Recent days (last 3 days)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "recent_days", + "ago_start_seconds": 259200, + "ago_end_seconds": 0 + } + }, + { + "id": "time_past_few_days", + "pattern": "那些天|之前几天|前几天", + "description": "Past few days (3-7 days ago)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "past_few_days", + "ago_start_seconds": 604800, + 
"ago_end_seconds": 259200 + } + }, + { + "id": "time_a_while_ago", + "pattern": "早些时候|以前|之前", + "description": "A while ago (beyond 30 days)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "a_while_ago", + "ago_start_seconds": -1, + "ago_end_seconds": 2592000 + } } ] } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index 44a1f477..25cb5836 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -162,7 +162,7 @@ TimeRangeFilter SemanticQueryBuilder::buildTimeRangeFilter(const TimeConstraint } break; case TimeConstraintKind::Relative: - filter.setLast(tc.relativeValue, tc.relativeUnit); + filter.setRange(tc.customStart, tc.customEnd); break; case TimeConstraintKind::Custom: filter.setRange(tc.customStart, tc.customEnd); From 4cbf3b05b21977f4e8cdaab7bdeda92608038262 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 15:39:06 +0800 Subject: [PATCH 04/36] feat: implement file size range filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added SizeRangeFilter class for file size range filtering functionality 2. Added SizeParser utility for human-readable size string parsing (e.g. "1K", "10M") 3. Implemented file size filtering in both indexed and real-time search strategies 4. Added file size numeric field support in search results 5. Integrated size filtering into CLI with --size-min and --size-max options 6. Added comprehensive unit tests for all new functionality Log: Added file size range filtering support for search operations Influence: 1. Test searching with various size ranges (min, max, both) 2. Test different size formats (K, M, G, T suffixes) 3. Verify boundary cases (0-size, max qint64 values) 4. Test combination with other filters like time range 5. 
Verify CLI interface with --size-min and --size-max feat: 实现文件大小范围过滤功能 1. 新增 SizeRangeFilter 类用于文件大小范围筛选 2. 添加 SizeParser 工具类用于解析可读性良好的大小字符串(如"1K","10M") 3. 在索引和实时搜索策略中实现文件大小过滤功能 4. 在搜索结果中添加文件大小数值字段支持 5. 在命令行接口中集成大小过滤功能(--size-min 和 --size-max 选项) 6. 为所有新功能添加全面的单元测试 Log: 为搜索操作添加文件大小范围过滤支持 Influence: 1. 测试不同大小范围的搜索(最小值、最大值、同时设置) 2. 测试不同大小格式(K、M、G、T 后缀) 3. 验证边界情况(0大小、qint64最大值) 4. 测试与其他过滤条件的组合使用(如时间范围) 5. 验证命令行接口的 --size-min 和 --size-max 选项 --- autotests/dfm-search-tests/CMakeLists.txt | 6 + autotests/dfm-search-tests/main.cpp | 5 + .../tst_size_range_filter.cpp | 374 ++++++++++++++++++ include/dfm-search/dfm-search/field_names.h | 1 + .../dfm-search/dfm-search/filenamesearchapi.h | 14 + include/dfm-search/dfm-search/searchoptions.h | 35 ++ .../dfm-search/dfm-search/sizerangefilter.h | 133 +++++++ .../dfm-search-client/CMakeLists.txt | 2 + .../dfm-search-client/cli_options.cpp | 48 ++- .../dfm-search-client/cli_options.h | 9 + src/dfm-search/dfm-search-client/main.cpp | 5 + .../dfm-search-client/output/json_output.cpp | 36 ++ .../dfm-search-client/output/text_output.cpp | 21 +- .../dfm-search-client/size_parser.cpp | 63 +++ .../dfm-search-client/size_parser.h | 39 ++ .../dfm-search-lib/core/searchoptions.cpp | 20 + .../dfm-search-lib/core/searchoptionsdata.h | 2 + .../dfm-search-lib/core/sizerangefilter.cpp | 126 ++++++ .../filenamesearch/filenamesearchapi.cpp | 12 + .../filenamesearch/filenamesearchengine.cpp | 5 +- .../filenamestrategies/indexedstrategy.cpp | 26 ++ .../filenamestrategies/realtimestrategy.cpp | 29 +- 22 files changed, 1004 insertions(+), 7 deletions(-) create mode 100644 autotests/dfm-search-tests/tst_size_range_filter.cpp create mode 100644 include/dfm-search/dfm-search/sizerangefilter.h create mode 100644 src/dfm-search/dfm-search-client/size_parser.cpp create mode 100644 src/dfm-search/dfm-search-client/size_parser.h create mode 100644 src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp diff --git a/autotests/dfm-search-tests/CMakeLists.txt 
b/autotests/dfm-search-tests/CMakeLists.txt index 3aa002e0..6d1bb862 100644 --- a/autotests/dfm-search-tests/CMakeLists.txt +++ b/autotests/dfm-search-tests/CMakeLists.txt @@ -24,10 +24,16 @@ target_link_libraries(dfm-search-test ${QT_TEST_LIB} ) +# Add size_parser source for testing (it's part of dfm-searcher client but we test it here) +target_sources(dfm-search-test PRIVATE + ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-client/size_parser.cpp +) + target_include_directories(dfm-search-test PRIVATE ${CMAKE_SOURCE_DIR}/src/dfm-search ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-lib + ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-client ) # Pass source directory for locating rule files at runtime diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index 96b6b4ca..31282df7 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -15,6 +15,7 @@ extern QObject *create_tst_FileTypeExtraction(); extern QObject *create_tst_KeywordExtraction(); extern QObject *create_tst_ParsedIntent(); extern QObject *create_tst_ChineseNLP(); +extern QObject *create_tst_SizeRangeFilter(); int main(int argc, char *argv[]) { @@ -61,5 +62,9 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj10, argc, argv); delete testObj10; + QObject *testObj11 = create_tst_SizeRangeFilter(); + result |= QTest::qExec(testObj11, argc, argv); + delete testObj11; + return result; } diff --git a/autotests/dfm-search-tests/tst_size_range_filter.cpp b/autotests/dfm-search-tests/tst_size_range_filter.cpp new file mode 100644 index 00000000..ca361e22 --- /dev/null +++ b/autotests/dfm-search-tests/tst_size_range_filter.cpp @@ -0,0 +1,374 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include + +#include "size_parser.h" + +using namespace DFMSEARCH; + +class tst_SizeRangeFilter : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + // SizeRangeFilter tests + void testDefaultState(); + void testSetMin(); + void testSetMax(); + void testSetRange(); + void testFluentChaining(); + void testBoundaryControl(); + void testCopyConstructor(); + void testMoveConstructor(); + void testClear(); + void testIsValid(); + + // SizeParser tests + void testParseSizeBytes(); + void testParseSizeKilobytes(); + void testParseSizeMegabytes(); + void testParseSizeGigabytes(); + void testParseSizeTerabytes(); + void testParseSizeCaseInsensitive(); + void testParseSizeInvalid(); + void testParseSizeEmpty(); + void testParseSizeWithSpaces(); + + // SearchOptions integration + void testSearchOptionsSizeFilter(); + void testSearchOptionsClearSizeFilter(); + + // FileNameResultAPI integration + void testFileNameResultAPIFileSizeBytes(); +}; + +// ==================== SizeRangeFilter Tests ==================== + +void tst_SizeRangeFilter::testDefaultState() +{ + SizeRangeFilter filter; + QCOMPARE(filter.minSize(), 0); + QCOMPARE(filter.maxSize(), 0); + QCOMPARE(filter.includeLower(), true); + QCOMPARE(filter.includeUpper(), true); + QCOMPARE(filter.isValid(), false); +} + +void tst_SizeRangeFilter::testSetMin() +{ + SizeRangeFilter filter; + filter.setMin(1024); + + QCOMPARE(filter.minSize(), 1024); + QCOMPARE(filter.isValid(), true); +} + +void tst_SizeRangeFilter::testSetMax() +{ + SizeRangeFilter filter; + filter.setMax(10 * 1024 * 1024); + + QCOMPARE(filter.maxSize(), 10 * 1024 * 1024); + QCOMPARE(filter.isValid(), true); +} + +void tst_SizeRangeFilter::testSetRange() +{ + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + + QCOMPARE(filter.minSize(), 1024); + QCOMPARE(filter.maxSize(), 10 * 1024 * 1024); + QCOMPARE(filter.isValid(), true); +} + +void 
tst_SizeRangeFilter::testFluentChaining() +{ + SizeRangeFilter filter; + auto &ref = filter.setMin(1024).setMax(10 * 1024 * 1024); + + QCOMPARE(filter.minSize(), 1024); + QCOMPARE(filter.maxSize(), 10 * 1024 * 1024); + QCOMPARE(&ref, &filter); // 返回自身的引用 +} + +void tst_SizeRangeFilter::testBoundaryControl() +{ + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + filter.setIncludeLower(false); + filter.setIncludeUpper(false); + + QCOMPARE(filter.includeLower(), false); + QCOMPARE(filter.includeUpper(), false); +} + +void tst_SizeRangeFilter::testCopyConstructor() +{ + SizeRangeFilter original; + original.setRange(1024, 10 * 1024 * 1024); + original.setIncludeLower(false); + + SizeRangeFilter copy(original); + QCOMPARE(copy.minSize(), 1024); + QCOMPARE(copy.maxSize(), 10 * 1024 * 1024); + QCOMPARE(copy.includeLower(), false); + QCOMPARE(copy.includeUpper(), true); + QCOMPARE(copy.isValid(), true); +} + +void tst_SizeRangeFilter::testMoveConstructor() +{ + SizeRangeFilter original; + original.setRange(1024, 10 * 1024 * 1024); + + SizeRangeFilter moved(std::move(original)); + QCOMPARE(moved.minSize(), 1024); + QCOMPARE(moved.maxSize(), 10 * 1024 * 1024); + QCOMPARE(moved.isValid(), true); +} + +void tst_SizeRangeFilter::testClear() +{ + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + filter.setIncludeLower(false); + filter.setIncludeUpper(false); + + filter.clear(); + + QCOMPARE(filter.minSize(), 0); + QCOMPARE(filter.maxSize(), 0); + QCOMPARE(filter.includeLower(), true); // 重置为默认值 + QCOMPARE(filter.includeUpper(), true); + QCOMPARE(filter.isValid(), false); +} + +void tst_SizeRangeFilter::testIsValid() +{ + SizeRangeFilter filter; + + // 默认状态无效 + QCOMPARE(filter.isValid(), false); + + // 设置 min 后有效 + filter.setMin(1); + QCOMPARE(filter.isValid(), true); + filter.clear(); + + // 设置 max 后有效 + filter.setMax(1); + QCOMPARE(filter.isValid(), true); + filter.clear(); + + // 设置 0 值仍无效 + filter.setMin(0); + QCOMPARE(filter.isValid(), 
false); + filter.setMax(0); + QCOMPARE(filter.isValid(), false); +} + +// ==================== SizeParser Tests ==================== + +void tst_SizeRangeFilter::testParseSizeBytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("512", bytes)); + QCOMPARE(bytes, 512); + + QVERIFY(dfmsearch::SizeParser::parseSize("0", bytes)); + QCOMPARE(bytes, 0); + + QVERIFY(dfmsearch::SizeParser::parseSize("1024", bytes)); + QCOMPARE(bytes, 1024); +} + +void tst_SizeRangeFilter::testParseSizeKilobytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1K", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1KB", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("10K", bytes)); + QCOMPARE(bytes, 10240); + + QVERIFY(dfmsearch::SizeParser::parseSize("1.5K", bytes)); + QCOMPARE(bytes, 1536); +} + +void tst_SizeRangeFilter::testParseSizeMegabytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1M", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1MB", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("10M", bytes)); + QCOMPARE(bytes, 10LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1.5M", bytes)); + QCOMPARE(bytes, static_cast(1.5 * 1024 * 1024)); +} + +void tst_SizeRangeFilter::testParseSizeGigabytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1G", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1GB", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("10G", bytes)); + QCOMPARE(bytes, 10LL * 1024 * 1024 * 1024); +} + +void tst_SizeRangeFilter::testParseSizeTerabytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1T", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1TB", 
bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024 * 1024); +} + +void tst_SizeRangeFilter::testParseSizeCaseInsensitive() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1k", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1m", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1g", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1kb", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1mb", bytes)); + QCOMPARE(bytes, 1024 * 1024); +} + +void tst_SizeRangeFilter::testParseSizeInvalid() +{ + qint64 bytes = 0; + + // 未知后缀 + QVERIFY(!dfmsearch::SizeParser::parseSize("1X", bytes)); + + // 纯字母 + QVERIFY(!dfmsearch::SizeParser::parseSize("abc", bytes)); + + // 负数 + QVERIFY(!dfmsearch::SizeParser::parseSize("-1K", bytes)); + + // 空数字 + QVERIFY(!dfmsearch::SizeParser::parseSize("K", bytes)); +} + +void tst_SizeRangeFilter::testParseSizeEmpty() +{ + qint64 bytes = -1; + + QVERIFY(!dfmsearch::SizeParser::parseSize("", bytes)); + QCOMPARE(bytes, -1); // 不应被修改 +} + +void tst_SizeRangeFilter::testParseSizeWithSpaces() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize(" 1M ", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize(" 512 ", bytes)); + QCOMPARE(bytes, 512); +} + +// ==================== SearchOptions Integration Tests ==================== + +void tst_SizeRangeFilter::testSearchOptionsSizeFilter() +{ + SearchOptions options; + + // 默认无大小过滤 + QCOMPARE(options.hasSizeRangeFilter(), false); + + // 设置大小过滤 + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + options.setSizeRangeFilter(filter); + + QCOMPARE(options.hasSizeRangeFilter(), true); + + SizeRangeFilter retrieved = options.sizeRangeFilter(); + QCOMPARE(retrieved.minSize(), 1024); + QCOMPARE(retrieved.maxSize(), 10 * 1024 * 1024); +} + +void 
tst_SizeRangeFilter::testSearchOptionsClearSizeFilter() +{ + SearchOptions options; + + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + options.setSizeRangeFilter(filter); + QCOMPARE(options.hasSizeRangeFilter(), true); + + options.clearSizeRangeFilter(); + QCOMPARE(options.hasSizeRangeFilter(), false); + + SizeRangeFilter retrieved = options.sizeRangeFilter(); + QCOMPARE(retrieved.minSize(), 0); + QCOMPARE(retrieved.maxSize(), 0); +} + +// ==================== FileNameResultAPI Integration Tests ==================== + +void tst_SizeRangeFilter::testFileNameResultAPIFileSizeBytes() +{ + SearchResult result("/home/user/test.txt"); + FileNameResultAPI api(result); + + // 默认值 + QCOMPARE(api.fileSizeBytes(), 0); + + // 设置和获取 + api.setFileSizeBytes(1024); + QCOMPARE(api.fileSizeBytes(), 1024); + + // 设置大文件 + api.setFileSizeBytes(10LL * 1024 * 1024 * 1024); + QCOMPARE(api.fileSizeBytes(), 10LL * 1024 * 1024 * 1024); + + // 设置 0 + api.setFileSizeBytes(0); + QCOMPARE(api.fileSizeBytes(), 0); +} + +QObject *create_tst_SizeRangeFilter() +{ + return new tst_SizeRangeFilter(); +} + +#include "tst_size_range_filter.moc" diff --git a/include/dfm-search/dfm-search/field_names.h b/include/dfm-search/dfm-search/field_names.h index 2cdbf9f8..c1dfb51b 100644 --- a/include/dfm-search/dfm-search/field_names.h +++ b/include/dfm-search/dfm-search/field_names.h @@ -21,6 +21,7 @@ constexpr const wchar_t kFullPath[] = L"full_path"; constexpr const wchar_t kIsHidden[] = L"is_hidden"; constexpr const wchar_t kModifyTime[] = L"modify_time"; constexpr const wchar_t kBirthTime[] = L"birth_time"; +constexpr const wchar_t kFileSize[] = L"file_size"; constexpr const wchar_t kFileSizeStr[] = L"file_size_str"; constexpr const wchar_t kPinyin[] = L"pinyin"; constexpr const wchar_t kPinyinAcronym[] = L"pinyin_acronym"; diff --git a/include/dfm-search/dfm-search/filenamesearchapi.h b/include/dfm-search/dfm-search/filenamesearchapi.h index 4af8fba7..4093f34d 100644 --- 
a/include/dfm-search/dfm-search/filenamesearchapi.h +++ b/include/dfm-search/dfm-search/filenamesearchapi.h @@ -237,6 +237,20 @@ class FileNameResultAPI */ QString birthTimeString() const; + // ==================== File Size (Numeric) ==================== + + /** + * @brief Set the file size in bytes + * @param bytes File size in bytes + */ + void setFileSizeBytes(qint64 bytes); + + /** + * @brief Get the file size in bytes + * @return File size in bytes, 0 if not set + */ + qint64 fileSizeBytes() const; + private: SearchResult &m_result; }; diff --git a/include/dfm-search/dfm-search/searchoptions.h b/include/dfm-search/dfm-search/searchoptions.h index 6d22b5c9..b13f1819 100644 --- a/include/dfm-search/dfm-search/searchoptions.h +++ b/include/dfm-search/dfm-search/searchoptions.h @@ -10,6 +10,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -254,6 +255,40 @@ class SearchOptions */ void clearTimeRangeFilter(); + /** + * @brief Sets the file size range filter for search operations. + * + * The size range filter allows filtering search results based on file size in bytes. + * Supports setting minimum and/or maximum file size boundaries. + * + * @param filter The SizeRangeFilter to apply + * @sa sizeRangeFilter(), hasSizeRangeFilter(), clearSizeRangeFilter() + */ + void setSizeRangeFilter(const SizeRangeFilter &filter); + + /** + * @brief Returns the current file size range filter. + * + * @return The current SizeRangeFilter + * @sa setSizeRangeFilter() + */ + SizeRangeFilter sizeRangeFilter() const; + + /** + * @brief Checks if a file size range filter is set. + * + * @return true if a valid size range filter is set, false otherwise + * @sa setSizeRangeFilter(), clearSizeRangeFilter() + */ + bool hasSizeRangeFilter() const; + + /** + * @brief Clears the file size range filter. 
+ * + * @sa setSizeRangeFilter(), hasSizeRangeFilter() + */ + void clearSizeRangeFilter(); + private: std::unique_ptr d; // PIMPL }; diff --git a/include/dfm-search/dfm-search/sizerangefilter.h b/include/dfm-search/dfm-search/sizerangefilter.h new file mode 100644 index 00000000..a99cf95b --- /dev/null +++ b/include/dfm-search/dfm-search/sizerangefilter.h @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef SIZERANGEFILTER_H +#define SIZERANGEFILTER_H + +#include + +#include + +DFM_SEARCH_BEGIN_NS + +class SizeRangeFilterData; + +/** + * @brief The SizeRangeFilter class provides file size range filtering for search operations. + * + * This class provides a fluent interface for specifying file size ranges. + * Size values are in bytes. + * + * Example usage: + * @code + * // Files between 1KB and 10MB + * SizeRangeFilter filter; + * filter.setRange(1024, 10 * 1024 * 1024); + * + * // Files larger than 1MB (including 1MB) + * SizeRangeFilter filter; + * filter.setMin(1024 * 1024).setIncludeLower(true); + * + * // Files smaller than 100KB (excluding 100KB) + * SizeRangeFilter filter; + * filter.setMax(100 * 1024).setIncludeUpper(false); + * @endcode + */ +class SizeRangeFilter +{ +public: + SizeRangeFilter(); + SizeRangeFilter(const SizeRangeFilter &other); + SizeRangeFilter(SizeRangeFilter &&other) noexcept; + ~SizeRangeFilter(); + + SizeRangeFilter &operator=(const SizeRangeFilter &other); + SizeRangeFilter &operator=(SizeRangeFilter &&other) noexcept; + + // ---------- Range Setting ---------- + + /** + * @brief Set the minimum file size in bytes + * @param minSize Minimum file size (0 means no lower bound) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setMin(qint64 minSize); + + /** + * @brief Set the maximum file size in bytes + * @param maxSize Maximum file size (0 means no upper bound) + * @return Reference to this filter for 
method chaining + */ + SizeRangeFilter &setMax(qint64 maxSize); + + /** + * @brief Set both min and max file size in bytes + * @param minSize Minimum file size (0 means no lower bound) + * @param maxSize Maximum file size (0 means no upper bound) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setRange(qint64 minSize, qint64 maxSize); + + // ---------- Accessors ---------- + + /** + * @brief Get the minimum file size + * @return Minimum file size in bytes (0 means no lower bound) + */ + qint64 minSize() const; + + /** + * @brief Get the maximum file size + * @return Maximum file size in bytes (0 means no upper bound) + */ + qint64 maxSize() const; + + // ---------- Boundary Control ---------- + + /** + * @brief Set whether the lower bound is inclusive + * @param include true to include the lower bound (default: true) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setIncludeLower(bool include); + + /** + * @brief Set whether the upper bound is inclusive + * @param include true to include the upper bound (default: true) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setIncludeUpper(bool include); + + /** + * @brief Check if lower bound is inclusive + * @return true if lower bound is inclusive + */ + bool includeLower() const; + + /** + * @brief Check if upper bound is inclusive + * @return true if upper bound is inclusive + */ + bool includeUpper() const; + + // ---------- Filter State ---------- + + /** + * @brief Clear the filter (make it invalid) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &clear(); + + /** + * @brief Check if the filter is valid (has at least one bound set) + * @return true if min or max is set (> 0) + */ + bool isValid() const; + +private: + std::unique_ptr d; +}; + +DFM_SEARCH_END_NS + +#endif // SIZERANGEFILTER_H diff --git a/src/dfm-search/dfm-search-client/CMakeLists.txt 
b/src/dfm-search/dfm-search-client/CMakeLists.txt index dcba205b..e630c69e 100644 --- a/src/dfm-search/dfm-search-client/CMakeLists.txt +++ b/src/dfm-search/dfm-search-client/CMakeLists.txt @@ -8,6 +8,8 @@ set(SRCS cli_options.h time_parser.cpp time_parser.h + size_parser.cpp + size_parser.h output/output_formatter.h output/text_output.cpp output/text_output.h diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index e2b078d0..38d740c7 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -4,6 +4,7 @@ #include "cli_options.h" #include "time_parser.h" +#include "size_parser.h" #include #include @@ -46,7 +47,9 @@ CliOptions::CliOptions() m_timeLastMonthOption(QStringList() << "time-last-month", "Filter files from last month"), m_timeThisYearOption(QStringList() << "time-this-year", "Filter files from this year"), m_timeLastYearOption(QStringList() << "time-last-year", "Filter files from last year"), - m_timeRangeOption(QStringList() << "time-range", "Custom time range (start,end)", "range") + m_timeRangeOption(QStringList() << "time-range", "Custom time range (start,end)", "range"), + m_sizeMinOption(QStringList() << "size-min", "Minimum file size (e.g., 1K, 10M, 1G, 512)", "size"), + m_sizeMaxOption(QStringList() << "size-max", "Maximum file size (e.g., 1K, 10M, 1G, 512)", "size") { setupOptions(); } @@ -87,6 +90,10 @@ void CliOptions::setupOptions() m_parser.addOption(m_timeLastYearOption); m_parser.addOption(m_timeRangeOption); + // File size range filtering options + m_parser.addOption(m_sizeMinOption); + m_parser.addOption(m_sizeMaxOption); + // 位置参数 m_parser.addPositionalArgument("keyword", "Search keyword"); m_parser.addPositionalArgument("search_path", "Path to search in"); @@ -136,6 +143,13 @@ void CliOptions::printHelp() const std::cout << " --time-range=, Custom time range (format: YYYY-MM-DD or \"YYYY-MM-DD HH:MM\")" << std::endl; 
std::cout << " Example: --time-range=\"2025-01-01,2025-12-31\"" << std::endl; std::cout << std::endl; + std::cout << "File Size Range Filter Options:" << std::endl; + std::cout << " --size-min= Minimum file size (e.g., 1K, 10M, 1G, 512)" << std::endl; + std::cout << " Units: K=KB, M=MB, G=GB, T=TB (default: bytes)" << std::endl; + std::cout << " --size-max= Maximum file size (e.g., 1K, 10M, 1G, 512)" << std::endl; + std::cout << " Units: K=KB, M=MB, G=GB, T=TB (default: bytes)" << std::endl; + std::cout << " Example: --size-min=1M --size-max=100M" << std::endl; + std::cout << std::endl; std::cout << "Output Options:" << std::endl; std::cout << " --json, -j Output results in JSON format" << std::endl; std::cout << " --verbose, -v Enable verbose output with detailed result information" << std::endl; @@ -159,6 +173,9 @@ void CliOptions::printHelp() const std::cout << std::endl; std::cout << " # Semantic search with JSON output" << std::endl; std::cout << " dfm-searcher -s -j \"content contains meeting notes\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << " # Filename search with file size filter (1MB to 100MB)" << std::endl; + std::cout << " dfm-searcher --size-min=1M --size-max=100M \"video\" /home/user" << std::endl; } bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) @@ -284,7 +301,34 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) } // 解析时间范围选项 - return parseTimeOptions(config); + if (!parseTimeOptions(config)) { + return false; + } + + // 解析文件大小范围选项 + if (m_parser.isSet(m_sizeMinOption)) { + qint64 minBytes = 0; + if (SizeParser::parseSize(m_parser.value(m_sizeMinOption), minBytes) && minBytes > 0) { + config.sizeFilter.setMin(minBytes); + config.hasSizeFilter = true; + } else { + std::cerr << "Error: Invalid --size-min format. 
Use format like '1K', '10M', '1G', or '512'" << std::endl; + return false; + } + } + + if (m_parser.isSet(m_sizeMaxOption)) { + qint64 maxBytes = 0; + if (SizeParser::parseSize(m_parser.value(m_sizeMaxOption), maxBytes) && maxBytes > 0) { + config.sizeFilter.setMax(maxBytes); + config.hasSizeFilter = true; + } else { + std::cerr << "Error: Invalid --size-max format. Use format like '1K', '10M', '1G', or '512'" << std::endl; + return false; + } + } + + return true; } bool CliOptions::parseTimeOptions(SearchCliConfig &config) diff --git a/src/dfm-search/dfm-search-client/cli_options.h b/src/dfm-search/dfm-search-client/cli_options.h index 9edbd60e..6fce5f3f 100644 --- a/src/dfm-search/dfm-search-client/cli_options.h +++ b/src/dfm-search/dfm-search-client/cli_options.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace dfmsearch { @@ -53,6 +54,10 @@ struct SearchCliConfig // Time range filtering bool hasTimeFilter = false; DFMSEARCH::TimeRangeFilter timeFilter; + + // File size range filtering + bool hasSizeFilter = false; + DFMSEARCH::SizeRangeFilter sizeFilter; }; /** @@ -117,6 +122,10 @@ class CliOptions QCommandLineOption m_timeThisYearOption; QCommandLineOption m_timeLastYearOption; QCommandLineOption m_timeRangeOption; + + // File size range filtering options + QCommandLineOption m_sizeMinOption; + QCommandLineOption m_sizeMaxOption; }; } // namespace dfmsearch diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index db1af343..202d9976 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -72,6 +72,11 @@ static void configureSearchOptions(SearchOptions &options, const SearchCliConfig if (config.hasTimeFilter) { options.setTimeRangeFilter(config.timeFilter); } + + // 应用文件大小范围过滤 + if (config.hasSizeFilter) { + options.setSizeRangeFilter(config.sizeFilter); + } } /** diff --git a/src/dfm-search/dfm-search-client/output/json_output.cpp 
b/src/dfm-search/dfm-search-client/output/json_output.cpp index b6208a2d..0034013b 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.cpp +++ b/src/dfm-search/dfm-search-client/output/json_output.cpp @@ -40,6 +40,12 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) if (!resultAPI.isDirectory()) { obj["fileType"] = resultAPI.fileType(); obj["size"] = resultAPI.size(); + + // 文件大小数值(字节) + qint64 fileSizeBytes = resultAPI.fileSizeBytes(); + if (fileSizeBytes > 0) { + obj["sizeBytes"] = fileSizeBytes; + } } QString filename = resultAPI.filename(); @@ -207,6 +213,21 @@ void JsonOutput::outputStreamingStart() searchInfo["timeRangeFilter"] = timeFilterInfo; } + // 添加文件大小范围过滤信息 + if (m_options.hasSizeRangeFilter()) { + DFMSEARCH::SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QJsonObject sizeFilterInfo; + if (sizeFilter.minSize() > 0) { + sizeFilterInfo["minBytes"] = sizeFilter.minSize(); + } + if (sizeFilter.maxSize() > 0) { + sizeFilterInfo["maxBytes"] = sizeFilter.maxSize(); + } + sizeFilterInfo["includeLower"] = sizeFilter.includeLower(); + sizeFilterInfo["includeUpper"] = sizeFilter.includeUpper(); + searchInfo["sizeRangeFilter"] = sizeFilterInfo; + } + startObj["search"] = searchInfo; startObj["timestamp"] = QDateTime::currentDateTime().toString(Qt::ISODate); @@ -300,6 +321,21 @@ void JsonOutput::outputCompleteResult(const QList &results) searchInfo["timeRangeFilter"] = timeFilterInfo; } + // 添加文件大小范围过滤信息 + if (m_options.hasSizeRangeFilter()) { + DFMSEARCH::SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QJsonObject sizeFilterInfo; + if (sizeFilter.minSize() > 0) { + sizeFilterInfo["minBytes"] = sizeFilter.minSize(); + } + if (sizeFilter.maxSize() > 0) { + sizeFilterInfo["maxBytes"] = sizeFilter.maxSize(); + } + sizeFilterInfo["includeLower"] = sizeFilter.includeLower(); + sizeFilterInfo["includeUpper"] = sizeFilter.includeUpper(); + searchInfo["sizeRangeFilter"] = sizeFilterInfo; + } + root["search"] = 
searchInfo; // 时间戳 diff --git a/src/dfm-search/dfm-search-client/output/text_output.cpp b/src/dfm-search/dfm-search-client/output/text_output.cpp index c1cca72e..63bef3ca 100644 --- a/src/dfm-search/dfm-search-client/output/text_output.cpp +++ b/src/dfm-search/dfm-search-client/output/text_output.cpp @@ -55,6 +55,20 @@ void TextOutput::outputSearchStarted() << " to " << end.toString("yyyy-MM-dd HH:mm:ss").toStdString(); std::cout << std::endl; } + + // 打印文件大小范围过滤 + if (m_options.hasSizeRangeFilter()) { + DFMSEARCH::SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + std::cout << "Size range filter: "; + if (sizeFilter.minSize() > 0) { + std::cout << "min=" << sizeFilter.minSize() << " bytes"; + } + if (sizeFilter.maxSize() > 0) { + if (sizeFilter.minSize() > 0) std::cout << ", "; + std::cout << "max=" << sizeFilter.maxSize() << " bytes"; + } + std::cout << std::endl; + } } void TextOutput::printSearchResult(const SearchResult &result) @@ -75,7 +89,12 @@ void TextOutput::printSearchResult(const SearchResult &result) std::cout << " Type: Directory" << std::endl; } else { std::cout << " Type: " << resultAPI.fileType().toStdString() << std::endl; - std::cout << " Size: " << resultAPI.size().toStdString() << " bytes" << std::endl; + qint64 fileSizeBytes = resultAPI.fileSizeBytes(); + if (fileSizeBytes > 0) { + std::cout << " Size: " << fileSizeBytes << " bytes" << std::endl; + } else { + std::cout << " Size: " << resultAPI.size().toStdString() << " bytes" << std::endl; + } } // 文件名和扩展名 diff --git a/src/dfm-search/dfm-search-client/size_parser.cpp b/src/dfm-search/dfm-search-client/size_parser.cpp new file mode 100644 index 00000000..d28720d3 --- /dev/null +++ b/src/dfm-search/dfm-search-client/size_parser.cpp @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "size_parser.h" + +namespace dfmsearch { + +bool SizeParser::parseSize(const QString &arg, qint64 &bytes) +{ + if (arg.isEmpty()) { + return false; + } + + QString trimmed = arg.trimmed(); + if (trimmed.isEmpty()) { + return false; + } + + // 提取数字部分和单位后缀 + QString numStr; + QString suffix; + + for (int i = 0; i < trimmed.length(); ++i) { + QChar c = trimmed[i]; + if (c.isDigit() || c == '.') { + numStr += c; + } else { + suffix = trimmed.mid(i).trimmed().toUpper(); + break; + } + } + + if (numStr.isEmpty()) { + return false; + } + + bool ok = false; + double value = numStr.toDouble(&ok); + if (!ok || value < 0) { + return false; + } + + // 根据后缀计算字节数 + qint64 multiplier = 1; + if (suffix == "K" || suffix == "KB") { + multiplier = 1024LL; + } else if (suffix == "M" || suffix == "MB") { + multiplier = 1024LL * 1024; + } else if (suffix == "G" || suffix == "GB") { + multiplier = 1024LL * 1024 * 1024; + } else if (suffix == "T" || suffix == "TB") { + multiplier = 1024LL * 1024 * 1024 * 1024; + } else if (!suffix.isEmpty()) { + // 未知后缀 + return false; + } + + bytes = static_cast(value * multiplier); + return true; +} + +} // namespace dfmsearch diff --git a/src/dfm-search/dfm-search-client/size_parser.h b/src/dfm-search/dfm-search-client/size_parser.h new file mode 100644 index 00000000..5ece29ba --- /dev/null +++ b/src/dfm-search/dfm-search-client/size_parser.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SIZE_PARSER_H +#define SIZE_PARSER_H + +#include + +namespace dfmsearch { + +/** + * @brief 文件大小参数解析工具类 + * + * 支持解析人类可读的文件大小字符串,如 "1K", "10M", "1G", "512" + * 不带后缀的纯数字视为字节数。 + */ +class SizeParser +{ +public: + /** + * @brief 解析文件大小字符串 + * @param arg 输入字符串(如 "1K", "10M", "1G", "512") + * @param bytes 输出字节数 + * @return 解析成功返回true + * + * 支持的后缀(不区分大小写): + * - K/KB: 千字节 (1024) + * - M/MB: 兆字节 (1024^2) + * - G/GB: 吉字节 (1024^3) + * - T/TB: 太字节 (1024^4) + * - 无后缀: 纯字节数 + */ + static bool parseSize(const QString &arg, qint64 &bytes); +}; + +} // namespace dfmsearch + +#endif // SIZE_PARSER_H diff --git a/src/dfm-search/dfm-search-lib/core/searchoptions.cpp b/src/dfm-search/dfm-search-lib/core/searchoptions.cpp index 8620b773..2657a28f 100644 --- a/src/dfm-search/dfm-search-lib/core/searchoptions.cpp +++ b/src/dfm-search/dfm-search-lib/core/searchoptions.cpp @@ -197,4 +197,24 @@ void SearchOptions::clearTimeRangeFilter() d->timeRangeFilter.clear(); } +void SearchOptions::setSizeRangeFilter(const SizeRangeFilter &filter) +{ + d->sizeRangeFilter = filter; +} + +SizeRangeFilter SearchOptions::sizeRangeFilter() const +{ + return d->sizeRangeFilter; +} + +bool SearchOptions::hasSizeRangeFilter() const +{ + return d->sizeRangeFilter.isValid(); +} + +void SearchOptions::clearSizeRangeFilter() +{ + d->sizeRangeFilter.clear(); +} + DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h b/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h index b5813887..2ae5e77c 100644 --- a/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h +++ b/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h @@ -9,6 +9,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -39,6 +40,7 @@ class SearchOptionsData int syncSearchTimeoutSecs { 60 }; int batchTimeMs { 1000 }; ///< Batch processing time interval in milliseconds TimeRangeFilter timeRangeFilter; ///< Time range filter for search + SizeRangeFilter 
sizeRangeFilter; ///< File size range filter for search }; DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp b/src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp new file mode 100644 index 00000000..bc3b50b5 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later +#include + +DFM_SEARCH_BEGIN_NS + +class SizeRangeFilterData +{ +public: + SizeRangeFilterData() + : minSize(0), maxSize(0), includeLower(true), includeUpper(true) + { + } + + SizeRangeFilterData(const SizeRangeFilterData &other) + : minSize(other.minSize), maxSize(other.maxSize), + includeLower(other.includeLower), includeUpper(other.includeUpper) + { + } + + qint64 minSize; + qint64 maxSize; + bool includeLower; + bool includeUpper; +}; + +SizeRangeFilter::SizeRangeFilter() + : d(std::make_unique()) +{ +} + +SizeRangeFilter::SizeRangeFilter(const SizeRangeFilter &other) + : d(std::make_unique(*other.d)) +{ +} + +SizeRangeFilter::SizeRangeFilter(SizeRangeFilter &&other) noexcept + : d(std::move(other.d)) +{ +} + +SizeRangeFilter::~SizeRangeFilter() = default; + +SizeRangeFilter &SizeRangeFilter::operator=(const SizeRangeFilter &other) +{ + if (this != &other) { + d = std::make_unique(*other.d); + } + return *this; +} + +SizeRangeFilter &SizeRangeFilter::operator=(SizeRangeFilter &&other) noexcept +{ + if (this != &other) { + d = std::move(other.d); + } + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setMin(qint64 minSize) +{ + d->minSize = minSize; + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setMax(qint64 maxSize) +{ + d->maxSize = maxSize; + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setRange(qint64 minSize, qint64 maxSize) +{ + d->minSize = minSize; + d->maxSize = maxSize; + return *this; +} + +qint64 SizeRangeFilter::minSize() const +{ + return d->minSize; +} + +qint64 
SizeRangeFilter::maxSize() const +{ + return d->maxSize; +} + +SizeRangeFilter &SizeRangeFilter::setIncludeLower(bool include) +{ + d->includeLower = include; + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setIncludeUpper(bool include) +{ + d->includeUpper = include; + return *this; +} + +bool SizeRangeFilter::includeLower() const +{ + return d->includeLower; +} + +bool SizeRangeFilter::includeUpper() const +{ + return d->includeUpper; +} + +SizeRangeFilter &SizeRangeFilter::clear() +{ + d->minSize = 0; + d->maxSize = 0; + d->includeLower = true; + d->includeUpper = true; + return *this; +} + +bool SizeRangeFilter::isValid() const +{ + return d->minSize > 0 || d->maxSize > 0; +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp index 8770125a..c47de316 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp @@ -168,4 +168,16 @@ QString FileNameResultAPI::birthTimeString() const return ts > 0 ? 
TimeResultAPI::formatTimestamp(ts) : QString(); } +// ==================== File Size (Numeric) ==================== + +void FileNameResultAPI::setFileSizeBytes(qint64 bytes) +{ + m_result.setCustomAttribute("fileSizeBytes", bytes); +} + +qint64 FileNameResultAPI::fileSizeBytes() const +{ + return m_result.customAttribute("fileSizeBytes").toLongLong(); +} + DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp index 8b1085ef..301bf1a4 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp @@ -56,8 +56,9 @@ SearchError FileNameSearchEngine::validateSearchConditions() // 文件名搜索特定验证 if (m_currentQuery.type() == SearchQuery::Type::Simple || m_currentQuery.type() == SearchQuery::Type::Wildcard) { - // 允许对一个类型, 后缀进行搜索,获取类型下所有文件 - if (m_currentQuery.keyword().isEmpty() && fileTypes.isEmpty() && fileExts.isEmpty()) { + // 允许对类型/后缀/大小范围/时间范围进行搜索,获取满足条件的所有文件 + if (m_currentQuery.keyword().isEmpty() && fileTypes.isEmpty() && fileExts.isEmpty() + && !m_options.hasSizeRangeFilter() && !m_options.hasTimeRangeFilter()) { return SearchError(FileNameSearchErrorCode::KeywordIsEmpty); } diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp index 6746fa2f..5adcbdd2 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp @@ -14,6 +14,7 @@ #include #include +#include #include "3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" @@ -594,6 +595,16 @@ SearchResult FileNameIndexedStrategy::processDetailedSearchResult( QString size = 
QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFileSizeStr)); api.setSize(size); + // 文件大小(数值,字节) + QString fileSizeStr = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFileSize)); + if (!fileSizeStr.isEmpty()) { + bool ok = false; + qint64 fileSizeBytes = fileSizeStr.toLongLong(&ok); + if (ok && fileSizeBytes >= 0) { + api.setFileSizeBytes(fileSizeBytes); + } + } + // 隐藏状态 QString isHiddenStr = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kIsHidden)).toLower(); api.setIsHidden(isHiddenStr == "y"); @@ -756,6 +767,21 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } } + // Add file size range filter query + if (m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + + QueryPtr sizeQuery = TimeRangeUtils::buildNumericRangeQuery( + LuceneFieldNames::FileName::kFileSize, + sizeFilter.minSize(), sizeFilter.maxSize(), + sizeFilter.includeLower(), sizeFilter.includeUpper()); + + if (sizeQuery) { + finalQuery->add(sizeQuery, BooleanClause::MUST); + hasValidQuery = true; + } + } + // Add path prefix query optimization if (hasValidQuery && SearchUtility::isFilenameIndexAncestorPathsSupported() && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp index dd0ff80f..e5c70609 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp @@ -12,6 +12,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -119,9 +120,9 @@ void FileNameRealTimeStrategy::search(const SearchQuery &query) QString fileName = info.fileName(); bool matches = false; - // 如果只有时间过滤没有关键词,直接匹配 + // 如果只有过滤条件(时间/大小)没有关键词,直接匹配 bool hasKeyword = !query.keyword().isEmpty() || 
query.type() == SearchQuery::Type::Boolean; - if (!hasKeyword && m_options.hasTimeRangeFilter()) { + if (!hasKeyword && (m_options.hasTimeRangeFilter() || m_options.hasSizeRangeFilter())) { matches = true; } // 简单查询模式 @@ -174,6 +175,29 @@ void FileNameRealTimeStrategy::search(const SearchQuery &query) matches = timeMatch; } + // 文件大小范围过滤 + if (matches && m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + qint64 fileSize = info.size(); + + bool sizeMatch = true; + if (sizeFilter.minSize() > 0) { + if (sizeFilter.includeLower()) { + sizeMatch = sizeMatch && (fileSize >= sizeFilter.minSize()); + } else { + sizeMatch = sizeMatch && (fileSize > sizeFilter.minSize()); + } + } + if (sizeFilter.maxSize() > 0) { + if (sizeFilter.includeUpper()) { + sizeMatch = sizeMatch && (fileSize <= sizeFilter.maxSize()); + } else { + sizeMatch = sizeMatch && (fileSize < sizeFilter.maxSize()); + } + } + matches = sizeMatch; + } + if (matches) { // 创建搜索结果 SearchResult result(info.filePath()); @@ -186,6 +210,7 @@ void FileNameRealTimeStrategy::search(const SearchQuery &query) api.setFileType(info.suffix().isEmpty() ? "unknown" : info.suffix().toLower()); api.setFileExtension(info.suffix().toLower()); api.setSize(QString::number(info.size())); + api.setFileSizeBytes(info.size()); } else { api.setFileType("dir"); } From fc6216da63b633fc902ef0e0cceaeab1488c5f11 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 16:28:49 +0800 Subject: [PATCH 05/36] feat: add file size constraint support in semantic search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added comprehensive support for file size constraints in the semantic search system. Implemented size-related NLP rules (preset ranges like "large files", exact sizes like "greater than 100MB"), a new SizeExtractor class, size constraint parsing, and integration with query building. The changes include: 1. 
New SizeConstraint type in semantic_types.h to represent parsed size constraints 2. SizeExtractor class to process size expressions in natural language 3. Nine test cases covering fuzzy, exact, range and combined size constraints 4. Size constraint rules in size_rules.json with fuzzy ranges, exact values, and range expressions 5. Integration with SemanticQueryBuilder to build size filters during search Log: Added support for natural language size constraints in file search (e.g. "large files", ">100MB") Influence: 1. Test fuzzy size expressions ("large files", "small files") 2. Test exact size expressions (">500M", "<100K") 3. Test ranges ("1M-10M files") 4. Test combined constraints with time/type ("yesterday's large videos") 5. Verify all size units (B, KB, MB, GB) 6. Test edge cases (invalid formats, zero sizes) feat: 在语义搜索中添加文件大小约束支持 为语义搜索系统全面添加了文件大小约束的支持。实现了大小相关的自然语言处理 规则(预设范围如"大文件"、精确大小如"大于100MB")、新的SizeExtractor类、 大小约束解析以及与查询构建的集成。具体更改包括: 1. 在semantic_types.h中添加SizeConstraint类型表示解析后的大小约束 2. 处理自然语言大小表达式的SizeExtractor类 3. 9个测试用例,覆盖模糊、精确、范围及组合大小约束 4. size_rules.json中的大小约束规则,包括模糊范围、精确值和范围表达式 5. 与SemanticQueryBuilder集成以在搜索时构建大小过滤器 Log: 在文件搜索中添加了对自然语言大小约束的支持(如"大文件"、">100MB") Influence: 1. 测试模糊大小表达式("大文件"、"小文件") 2. 测试精确大小表达式(">500M"、"<100K") 3. 测试范围表达式("1M-10M的文件") 4. 测试与时间/类型组合的约束("昨天的大视频") 5. 验证所有大小单位(B、KB、MB、GB) 6. 
测试边界情况(无效格式、零大小) --- .../dfm-search-tests/tst_chinese_nlp.cpp | 110 ++++++++++++++- .../dfm-search/dfm-search/semantic_types.h | 14 ++ .../semantic/extractors/sizeextractor.cpp | 126 ++++++++++++++++++ .../semantic/extractors/sizeextractor.h | 30 +++++ .../dfm-search-lib/semantic/intentparser.cpp | 2 + .../semantic/rules/zh_CN/size_rules.json | 70 ++++++++++ .../semantic/semanticquerybuilder.cpp | 21 ++- .../semantic/semanticquerybuilder.h | 3 +- 8 files changed, 370 insertions(+), 6 deletions(-) create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp index b034d101..4268c139 100644 --- a/autotests/dfm-search-tests/tst_chinese_nlp.cpp +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -96,6 +96,17 @@ private Q_SLOTS: void combined_monthAndType(); void combined_yearAndType(); + // Size tests + void size_fuzzy_large(); + void size_fuzzy_large_synonyms(); + void size_fuzzy_small(); + void size_dynamic_min(); + void size_dynamic_max(); + void size_dynamic_between(); + void size_combined_withTime(); + void size_combined_withType(); + void size_combined_full(); + // Relative time tests void timeRelative_justNow(); void timeRelative_justNow_synonyms(); @@ -149,7 +160,8 @@ void tst_ChineseNLP::initTestCase() QVERIFY2(QDir(dir).exists(), qPrintable(QStringLiteral("Rules dir not found: ") + dir)); const QStringList files = { "noise_rules.json", "time_rules.json", - "filetype_rules.json", "keyword_rules.json" }; + "filetype_rules.json", "keyword_rules.json", + "size_rules.json" }; for (const QString &f : files) { const QString path = dir + QLatin1Char('/') + f; bool ok = m_engine->loadRuleFile(path); @@ -161,17 +173,19 @@ void tst_ChineseNLP::initTestCase() 
QVERIFY(m_engine->hasGroup("filetype")); QVERIFY(m_engine->hasGroup("keyword")); QVERIFY(m_engine->hasGroup("noise")); + QVERIFY(m_engine->hasGroup("size")); const QStringList groups = m_engine->groupNames(); - QCOMPARE(groups.size(), 4); + QCOMPARE(groups.size(), 5); m_parser = new IntentParser(m_engine); // Verify default extractors are initialized QStringList names = m_parser->extractorNames(); - QCOMPARE(names.size(), 3); + QCOMPARE(names.size(), 4); QVERIFY(names.contains("time")); QVERIFY(names.contains("filetype")); + QVERIFY(names.contains("size")); QVERIFY(names.contains("keyword")); } @@ -1039,6 +1053,96 @@ void tst_ChineseNLP::combined_yearAndType() QVERIFY(intent.fileExtensions.contains("avi")); } +// ===== Size Tests ===== + +void tst_ChineseNLP::size_fuzzy_large() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB + QCOMPARE(intent.sizeConstraint.maxSize, 0LL); // no upper bound +} + +void tst_ChineseNLP::size_fuzzy_large_synonyms() +{ + const QStringList inputs = { QStringLiteral("很大的"), QStringLiteral("占空间的"), + QStringLiteral("几个G的") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的图片"), intent); + QVERIFY2(intent.sizeConstraint.isValid(), + qPrintable(QStringLiteral("Size not valid for: ") + input)); + } +} + +void tst_ChineseNLP::size_fuzzy_small() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 0LL); + QCOMPARE(intent.sizeConstraint.maxSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.includeUpper, false); +} + +void tst_ChineseNLP::size_dynamic_min() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大于500M的文档"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 
500MB + QVERIFY(intent.sizeConstraint.includeLower); +} + +void tst_ChineseNLP::size_dynamic_max() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小于100K的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.maxSize, 102400LL); // 100KB + QCOMPARE(intent.sizeConstraint.minSize, 0LL); +} + +void tst_ChineseNLP::size_dynamic_between() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("1M-10M的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB +} + +void tst_ChineseNLP::size_combined_withTime() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的大文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); +} + +void tst_ChineseNLP::size_combined_withType() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大文件 pdf"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QVERIFY(intent.fileExtensions.contains("pdf")); +} + +void tst_ChineseNLP::size_combined_full() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天大于100M的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); +} + // ===== Relative Time Tests ===== void tst_ChineseNLP::timeRelative_justNow() diff --git a/include/dfm-search/dfm-search/semantic_types.h b/include/dfm-search/dfm-search/semantic_types.h index f63be806..f3f71763 100644 --- a/include/dfm-search/dfm-search/semantic_types.h +++ 
b/include/dfm-search/dfm-search/semantic_types.h @@ -66,6 +66,19 @@ struct TimeConstraint bool isValid() const { return kind != TimeConstraintKind::None; } }; +/** + * @brief Represents a parsed size constraint from natural language. + */ +struct SizeConstraint +{ + qint64 minSize = 0; // Minimum size in bytes (0 = no lower bound) + qint64 maxSize = 0; // Maximum size in bytes (0 = no upper bound) + bool includeLower = true; + bool includeUpper = true; + + bool isValid() const { return minSize > 0 || maxSize > 0; } +}; + /** * @brief Represents the parsed intent from natural language input. * @@ -76,6 +89,7 @@ struct TimeConstraint struct ParsedIntent { TimeConstraint timeConstraint; + SizeConstraint sizeConstraint; QStringList fileExtensions; QStringList keywords; QList consumedSpans; diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp new file mode 100644 index 00000000..7fe51f4c --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "sizeextractor.h" + +#include "../semanticruleengine.h" + +#include + +DFM_SEARCH_BEGIN_NS + +SizeExtractor::SizeExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +SizeExtractor::~SizeExtractor() = default; + +void SizeExtractor::extract(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("size")) { + return; + } + + QString ruleId; + QRegularExpressionMatch match; + if (!m_engine->match("size", input, match, &ruleId)) { + return; + } + + const QVariantMap metadata = m_engine->ruleMetadata("size", ruleId); + const QString typeStr = metadata.value("type").toString(); + SizeConstraint sc; + + if (typeStr == "preset") { + sc.minSize = metadata.value("min_bytes", 0).toLongLong(); + sc.maxSize = metadata.value("max_bytes", 0).toLongLong(); + if (metadata.contains("include_upper")) { + sc.includeUpper = metadata.value("include_upper").toBool(); + } + if (metadata.contains("include_lower")) { + sc.includeLower = metadata.value("include_lower").toBool(); + } + } else if (typeStr == "dynamic") { + const QString direction = metadata.value("direction").toString(); + + if (direction == "min") { + const QString value = match.captured("value"); + const QString unit = match.captured("unit"); + qint64 bytes = parseSizeToBytes(value, unit); + if (bytes <= 0) { + return; + } + sc.minSize = bytes; + sc.includeLower = true; + } else if (direction == "max") { + const QString value = match.captured("value"); + const QString unit = match.captured("unit"); + qint64 bytes = parseSizeToBytes(value, unit); + if (bytes <= 0) { + return; + } + sc.maxSize = bytes; + sc.includeUpper = true; + } else if (direction == "range") { + const QString minVal = match.captured("min_val"); + const QString minUnit = match.captured("min_unit"); + const QString maxVal = match.captured("max_val"); + const QString maxUnit = match.captured("max_unit"); + qint64 minBytes = parseSizeToBytes(minVal, minUnit); + qint64 
maxBytes = parseSizeToBytes(maxVal, maxUnit); + if (minBytes <= 0 || maxBytes <= 0) { + return; + } + sc.minSize = minBytes; + sc.maxSize = maxBytes; + } + } + + if (sc.isValid()) { + intent.sizeConstraint = sc; + MatchSpan span; + span.start = match.capturedStart(); + span.end = match.capturedEnd(); + span.ruleId = ruleId; + intent.consumedSpans.append(span); + } +} + +qint64 SizeExtractor::parseSizeToBytes(const QString &value, const QString &unit) +{ + bool ok = false; + double num = value.toDouble(&ok); + if (!ok || num <= 0) { + return -1; + } + + QString upperUnit = unit.toUpper(); + if (upperUnit.isEmpty()) { + // No unit: assume bytes + return static_cast(num); + } + if (upperUnit == "B" || upperUnit == "BB") { + return static_cast(num); + } + if (upperUnit == "K" || upperUnit == "KB") { + return static_cast(num * 1024); + } + if (upperUnit == "M" || upperUnit == "MB") { + return static_cast(num * 1024 * 1024); + } + if (upperUnit == "G" || upperUnit == "GB") { + return static_cast(num * 1024 * 1024 * 1024); + } + + qWarning() << "Unknown size unit:" << unit; + return -1; +} + +QString SizeExtractor::name() const +{ + return QStringLiteral("size"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h new file mode 100644 index 00000000..d87a002a --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SIZEEXTRACTOR_H +#define SIZEEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class SizeExtractor : public DimensionExtractor +{ +public: + explicit SizeExtractor(SemanticRuleEngine *engine); + ~SizeExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + +private: + static qint64 parseSizeToBytes(const QString &value, const QString &unit); + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // SIZEEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp index 267fd3d9..db64d3fb 100644 --- a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp @@ -7,6 +7,7 @@ #include "semanticruleengine.h" #include "extractors/filetypeextractor.h" #include "extractors/keywordextractor.h" +#include "extractors/sizeextractor.h" #include "extractors/timeextractor.h" DFM_SEARCH_BEGIN_NS @@ -46,6 +47,7 @@ void IntentParser::initDefaultExtractors() // Order matters: keyword MUST be last (depends on consumedSpans) addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); } diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json new file mode 100644 index 00000000..354ad260 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json @@ -0,0 +1,70 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "size", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "size_fuzzy_large", + "pattern": "大文件|很大的|占空间的|几个G的|几个g的", + "description": "Fuzzy large files (>500MB)", + "enabled": true, + "priority": 200, + "metadata": { + "type":
"preset", + "min_bytes": 524288000, + "max_bytes": 0 + } + }, + { + "id": "size_small", + "pattern": "小文件|很小的| tiny的", + "description": "Small files (<1MB)", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "min_bytes": 0, + "max_bytes": 1048576, + "include_upper": false + } + }, + { + "id": "size_dynamic", + "pattern": "(大于|超过|最少|至少|>)\\s*(?<value>\\d+(?:\\.\\d+)?)\\s*(?<unit>[KkMmGg][Bb]?)以?[上内]?", + "description": "Dynamic precise size (e.g. 大于500M, 1G以上)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "dynamic", + "direction": "min" + } + }, + { + "id": "size_dynamic_less", + "pattern": "(小于|不超过|不到|最多|<)\\s*(?<value>\\d+(?:\\.\\d+)?)\\s*(?<unit>[KkMmGg][Bb]?)以?[下内]?", + "description": "Dynamic precise size less-than (e.g. 小于100K, 不到1G)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "dynamic", + "direction": "max" + } + }, + { + "id": "size_dynamic_between", + "pattern": "(?<min_val>\\d+(?:\\.\\d+)?)\\s*(?<min_unit>[KkMmGg][Bb]?)[\\s~\\-到至]+(?<max_val>\\d+(?:\\.\\d+)?)\\s*(?<max_unit>[KkMmGg][Bb]?)", + "description": "Size range (e.g.
1M-10M, 100K到500K)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "dynamic", + "direction": "range" + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index 25cb5836..4113654a 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -20,7 +20,7 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) SemanticSearchPlan plan; // Base options shared across all search paths - SearchOptions baseOpts = buildBaseOptions(intent.timeConstraint); + SearchOptions baseOpts = buildBaseOptions(intent.timeConstraint, intent.sizeConstraint); baseOpts.setSearchMethod(SearchMethod::Indexed); // --- File name search (always enabled) --- @@ -174,13 +174,30 @@ TimeRangeFilter SemanticQueryBuilder::buildTimeRangeFilter(const TimeConstraint return filter; } -SearchOptions SemanticQueryBuilder::buildBaseOptions(const TimeConstraint &tc) const +SizeRangeFilter SemanticQueryBuilder::buildSizeRangeFilter(const SizeConstraint &sc) const +{ + SizeRangeFilter filter; + if (!sc.isValid()) { + return filter; + } + filter.setMin(sc.minSize); + filter.setMax(sc.maxSize); + filter.setIncludeLower(sc.includeLower); + filter.setIncludeUpper(sc.includeUpper); + return filter; +} + +SearchOptions SemanticQueryBuilder::buildBaseOptions(const TimeConstraint &tc, const SizeConstraint &sc) const { SearchOptions opts; const TimeRangeFilter timeFilter = buildTimeRangeFilter(tc); if (timeFilter.isValid()) { opts.setTimeRangeFilter(timeFilter); } + const SizeRangeFilter sizeFilter = buildSizeRangeFilter(sc); + if (sizeFilter.isValid()) { + opts.setSizeRangeFilter(sizeFilter); + } return opts; } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h index 2f8c2136..88723c0e 100644 --- 
a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h @@ -44,7 +44,8 @@ class SemanticQueryBuilder private: TimeRangeFilter buildTimeRangeFilter(const TimeConstraint &tc) const; - SearchOptions buildBaseOptions(const TimeConstraint &tc) const; + SizeRangeFilter buildSizeRangeFilter(const SizeConstraint &sc) const; + SearchOptions buildBaseOptions(const TimeConstraint &tc, const SizeConstraint &sc) const; }; DFM_SEARCH_END_NS From 5ab5ca2bad2dda474041207996085456d788c5d9 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 17:29:57 +0800 Subject: [PATCH 06/36] feat: add action-based time field search support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added full support for action-based time field queries in semantic search: 1. Added new action_rules.json with create/modify action patterns 2. Added ActionExtractor to parse action rules 3. Implemented TimeField enum with Unspecified/Both options 4. Updated search builder and searcher to handle multiple time fields 5. Added comprehensive test cases for action-time field interaction Log: Added support for searching by file create/modify time using natural language actions like "新建的图片" or "修改过的文档" Influence: 1. Test natural language searches with create/modify action words (新/ 修改) 2. Verify single and compound time searches (e.g. "昨天修改过的文件") 3. Check default behavior with unspecified time fields 4. Test search performance with different time field combinations 5. Verify search results accuracy for both creation and modification times feat: 新增基于操作时间的文件搜索功能 为语义搜索添加了完整的基于操作时间的查询支持: 1. 新增action_rules.json包含创建/修改操作规则 2. 添加ActionExtractor解析操作规则 3. 实现TimeField枚举包含未指定/双重选项 4. 更新搜索构建器和搜索器处理多时间字段 5. 添加完整的操作-时间字段交互测试用例 Log: 新增支持使用自然语言操作词(如"新建的图片"/"修改过的文档")按文件创 建/修改时间搜索 Influence: 1. 测试带创建/修改操作词的自然语言搜索 2. 验证单个和复合时间搜索(如"昨天修改过的文件") 3. 检查未指定时间字段时的默认行为 4. 测试不同时间字段组合下的搜索性能 5. 
验证创建时间和修改时间搜索结果的准确性 --- .../dfm-search-tests/tst_chinese_nlp.cpp | 110 ++++++++++++++++- .../dfm-search/dfm-search/dsearch_global.h | 6 +- .../dfm-search/dfm-search/semantic_types.h | 1 + .../semantic/extractors/actionextractor.cpp | 56 +++++++++ .../semantic/extractors/actionextractor.h | 29 +++++ .../dfm-search-lib/semantic/intentparser.cpp | 2 + .../semantic/rules/zh_CN/action_rules.json | 32 +++++ .../semantic/semanticquerybuilder.cpp | 19 +++ .../semantic/semanticquerybuilder.h | 1 + .../semantic/semanticsearcher.cpp | 116 ++++++++++-------- .../semantic/semanticsearcher_p.h | 6 +- 11 files changed, 316 insertions(+), 62 deletions(-) create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp index 4268c139..a1c5fdbf 100644 --- a/autotests/dfm-search-tests/tst_chinese_nlp.cpp +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -118,6 +118,15 @@ private Q_SLOTS: void timeRelative_aWhileAgo_synonyms(); void timeRelative_priority_vs_preset(); + // Action behavior tests + void action_create_birthTime(); + void action_create_synonyms(); + void action_modify_modifyTime(); + void action_modify_synonyms(); + void action_default_unspecified(); + void action_combined_withTime_create(); + void action_combined_withTime_modify(); + // Keyword tests void keyword_contains_single(); void keyword_contains_multi(); @@ -155,13 +164,13 @@ void tst_ChineseNLP::initTestCase() m_engine = new SemanticRuleEngine(this); - // Load all 4 rule files + // Load all 6 rule files const QString dir = rulesDir(); QVERIFY2(QDir(dir).exists(), qPrintable(QStringLiteral("Rules dir not found: ") + dir)); const QStringList files = { "noise_rules.json", "time_rules.json", 
"filetype_rules.json", "keyword_rules.json", - "size_rules.json" }; + "size_rules.json", "action_rules.json" }; for (const QString &f : files) { const QString path = dir + QLatin1Char('/') + f; bool ok = m_engine->loadRuleFile(path); @@ -174,18 +183,20 @@ void tst_ChineseNLP::initTestCase() QVERIFY(m_engine->hasGroup("keyword")); QVERIFY(m_engine->hasGroup("noise")); QVERIFY(m_engine->hasGroup("size")); + QVERIFY(m_engine->hasGroup("action")); const QStringList groups = m_engine->groupNames(); - QCOMPARE(groups.size(), 5); + QCOMPARE(groups.size(), 6); m_parser = new IntentParser(m_engine); // Verify default extractors are initialized QStringList names = m_parser->extractorNames(); - QCOMPARE(names.size(), 4); + QCOMPARE(names.size(), 5); QVERIFY(names.contains("time")); QVERIFY(names.contains("filetype")); QVERIFY(names.contains("size")); + QVERIFY(names.contains("action")); QVERIFY(names.contains("keyword")); } @@ -1247,6 +1258,97 @@ void tst_ChineseNLP::timeRelative_priority_vs_preset() QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); } +// ===== Action Behavior Tests ===== + +void tst_ChineseNLP::action_create_birthTime() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("新建的图片"), intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::BirthTime); + // Action word should be consumed + bool actionConsumed = false; + for (const MatchSpan &span : intent.consumedSpans) { + if (span.ruleId == "action_create") { + actionConsumed = true; + break; + } + } + QVERIFY2(actionConsumed, "action_create should produce a consumed span"); +} + +void tst_ChineseNLP::action_create_synonyms() +{ + const QStringList inputs = { QStringLiteral("创建的文档"), QStringLiteral("存下来的图片"), + QStringLiteral("保存的文件"), QStringLiteral("新加的视频") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::BirthTime); + } +} + +void tst_ChineseNLP::action_modify_modifyTime() +{ + 
ParsedIntent intent; + m_parser->parse(QStringLiteral("修改过的图片"), intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::ModifyTime); + // Action word should be consumed + bool actionConsumed = false; + for (const MatchSpan &span : intent.consumedSpans) { + if (span.ruleId == "action_modify") { + actionConsumed = true; + break; + } + } + QVERIFY2(actionConsumed, "action_modify should produce a consumed span"); +} + +void tst_ChineseNLP::action_modify_synonyms() +{ + const QStringList inputs = { QStringLiteral("编辑过的文档"), QStringLiteral("改过的文件"), + QStringLiteral("写过的图片"), QStringLiteral("更新的视频") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::ModifyTime); + } +} + +void tst_ChineseNLP::action_default_unspecified() +{ + // Without action words, timeField should remain Unspecified + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QCOMPARE(intent.timeConstraint.timeField, TimeField::Unspecified); +} + +void tst_ChineseNLP::action_combined_withTime_create() +{ + // "新建的今天的文档" — action_create + time today + ParsedIntent intent; + m_parser->parse(QStringLiteral("新建的今天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QCOMPARE(intent.timeConstraint.timeField, TimeField::BirthTime); + // Both time and action spans consumed + int consumedCount = intent.consumedSpans.size(); + QVERIFY2(consumedCount >= 2, + qPrintable(QStringLiteral("Expected >=2 consumed spans, got ") + QString::number(consumedCount))); +} + +void tst_ChineseNLP::action_combined_withTime_modify() +{ + // "昨天修改过的图片" — time yesterday + action_modify + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天修改过的图片"), intent); + 
QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QCOMPARE(intent.timeConstraint.timeField, TimeField::ModifyTime); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + QObject *create_tst_ChineseNLP() { return new tst_ChineseNLP(); diff --git a/include/dfm-search/dfm-search/dsearch_global.h b/include/dfm-search/dfm-search/dsearch_global.h index f18892cd..cfe05523 100644 --- a/include/dfm-search/dfm-search/dsearch_global.h +++ b/include/dfm-search/dfm-search/dsearch_global.h @@ -227,8 +227,10 @@ Q_ENUM_NS(SearchMethod) // Enumeration for time field type enum class TimeField { - BirthTime, // File creation time - ModifyTime // File modification time + Unspecified, // No specific time field (search both BirthTime and ModifyTime) + BirthTime, // File creation time + ModifyTime, // File modification time + Both // Search both BirthTime and ModifyTime (union of results) }; Q_ENUM_NS(TimeField) diff --git a/include/dfm-search/dfm-search/semantic_types.h b/include/dfm-search/dfm-search/semantic_types.h index f3f71763..7a591e3e 100644 --- a/include/dfm-search/dfm-search/semantic_types.h +++ b/include/dfm-search/dfm-search/semantic_types.h @@ -62,6 +62,7 @@ struct TimeConstraint TimeUnit relativeUnit = TimeUnit::Days; QDateTime customStart; QDateTime customEnd; + TimeField timeField = TimeField::Unspecified; // Set by ActionExtractor; Unspecified = no action specified bool isValid() const { return kind != TimeConstraintKind::None; } }; diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp new file mode 100644 index 00000000..79e421e3 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "actionextractor.h" + +#include "../semanticruleengine.h" + +#include + +DFM_SEARCH_BEGIN_NS + +ActionExtractor::ActionExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +ActionExtractor::~ActionExtractor() = default; + +void ActionExtractor::extract(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("action")) { + return; + } + + QString ruleId; + QRegularExpressionMatch match; + if (!m_engine->match("action", input, match, &ruleId)) { + return; + } + + const QVariantMap metadata = m_engine->ruleMetadata("action", ruleId); + const QString timeFieldStr = metadata.value("time_field").toString(); + + if (timeFieldStr == "birth") { + intent.timeConstraint.timeField = TimeField::BirthTime; + } else if (timeFieldStr == "modify") { + intent.timeConstraint.timeField = TimeField::ModifyTime; + } else { + qWarning() << "Unknown time_field in action rule:" << timeFieldStr; + return; + } + + MatchSpan span; + span.start = match.capturedStart(); + span.end = match.capturedEnd(); + span.ruleId = ruleId; + intent.consumedSpans.append(span); +} + +QString ActionExtractor::name() const +{ + return QStringLiteral("action"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h new file mode 100644 index 00000000..b11ece8a --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef ACTIONEXTRACTOR_H +#define ACTIONEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class ActionExtractor : public DimensionExtractor +{ +public: + explicit ActionExtractor(SemanticRuleEngine *engine); + ~ActionExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + +private: + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // ACTIONEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp index db64d3fb..97abfae8 100644 --- a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp @@ -5,6 +5,7 @@ #include "intentparser.h" #include "semanticruleengine.h" +#include "extractors/actionextractor.h" #include "extractors/filetypeextractor.h" #include "extractors/keywordextractor.h" #include "extractors/sizeextractor.h" @@ -48,6 +49,7 @@ void IntentParser::initDefaultExtractors() addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); } diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json new file mode 100644 index 00000000..3ed04442 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json @@ -0,0 +1,32 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "action", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "action_create", + "pattern": "新建的|创建的|存下来的|保存的|新加的", + "description": "Action: created files (BirthTime)", + "enabled": true, + "priority": 200, + "metadata": { + "time_field": "birth" + } + }, + { + "id": "action_modify", + "pattern": 
"修改过的|编辑过的|改过的|写过的|更新的", + "description": "Action: modified files (ModifyTime)", + "enabled": true, + "priority": 200, + "metadata": { + "time_field": "modify" + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index 4113654a..0262c470 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -19,6 +19,18 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) { SemanticSearchPlan plan; + // Determine time field strategy + if (intent.timeConstraint.isValid() && intent.timeConstraint.timeField == TimeField::Unspecified) { + // Time constraint exists but no action specified → search both birth and modify time + plan.timeField = TimeField::Both; + } else if (intent.timeConstraint.timeField == TimeField::BirthTime) { + plan.timeField = TimeField::BirthTime; + } else if (intent.timeConstraint.timeField == TimeField::ModifyTime) { + plan.timeField = TimeField::ModifyTime; + } else { + plan.timeField = TimeField::ModifyTime; + } + // Base options shared across all search paths SearchOptions baseOpts = buildBaseOptions(intent.timeConstraint, intent.sizeConstraint); baseOpts.setSearchMethod(SearchMethod::Indexed); @@ -171,6 +183,13 @@ TimeRangeFilter SemanticQueryBuilder::buildTimeRangeFilter(const TimeConstraint break; } + // Set time field if explicitly specified by ActionExtractor + if (tc.timeField == TimeField::BirthTime || tc.timeField == TimeField::ModifyTime) { + filter.setTimeField(tc.timeField); + } + // When Unspecified or Both: timeField is NOT set on the filter here. + // The searcher handles Both by creating duplicate engines with each field. 
+ return filter; } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h index 88723c0e..276a694c 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h @@ -24,6 +24,7 @@ struct SemanticSearchPlan { std::optional contentOptions; std::optional ocrQuery; std::optional ocrOptions; + TimeField timeField = TimeField::ModifyTime; // BirthTime, ModifyTime, or Both }; /** diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp index 72518288..206fa8c6 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -63,7 +63,27 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) // Step 2: Build search plan const SemanticSearchPlan plan = queryBuilder->build(intent); - // Step 3: Create and launch search engines in parallel + // Step 3: Determine time fields to search + QList timeFields; + if (plan.timeField == TimeField::Both) { + timeFields = {TimeField::BirthTime, TimeField::ModifyTime}; + } else { + timeFields = {plan.timeField}; + } + + // Helper: apply time field to options (clone + setTimeField if time filter present) + auto applyTimeField = [](const SearchOptions &opts, TimeField tf) -> SearchOptions { + TimeRangeFilter tfCopy = opts.timeRangeFilter(); + if (tfCopy.isValid()) { + tfCopy.setTimeField(tf); + SearchOptions result = opts; + result.setTimeRangeFilter(tfCopy); + return result; + } + return opts; + }; + + // Step 4: Create and launch search engines in parallel auto onResultsFound = [this](const SearchResultList &results) { SearchResultList newResults; for (const SearchResult &r : results) { @@ -104,56 +124,54 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) pendingFinishCount.store(0); // 
Clean up any previous search engines (they have parent q, so Qt deletes them) - if (fileNameEngine) { - fileNameEngine->deleteLater(); - fileNameEngine = nullptr; - } - if (contentEngine) { - contentEngine->deleteLater(); - contentEngine = nullptr; - } - if (ocrEngine) { - ocrEngine->deleteLater(); - ocrEngine = nullptr; - } - - // File name search (always) - if (Global::isFileNameIndexReadyForSearch()) { - fileNameEngine = SearchEngine::create(SearchType::FileName, q); - fileNameEngine->setSearchOptions(plan.fileNameOptions); - - QObject::connect(fileNameEngine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(fileNameEngine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(fileNameEngine, &SearchEngine::errorOccurred, q, onError); - - pendingFinishCount.fetch_add(1); - fileNameEngine->search(plan.fileNameQuery); + for (SearchEngine *e : engines) { + e->deleteLater(); } + engines.clear(); + + // Launch engines for each time field (may be 1 or 2 for Both) + for (TimeField tf : timeFields) { + // File name search (always) + if (Global::isFileNameIndexReadyForSearch()) { + SearchEngine *engine = SearchEngine::create(SearchType::FileName, q); + engine->setSearchOptions(applyTimeField(plan.fileNameOptions, tf)); + + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(plan.fileNameQuery); + } - // Content search - if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { - contentEngine = SearchEngine::create(SearchType::Content, q); - contentEngine->setSearchOptions(*plan.contentOptions); + // Content search + if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { + SearchEngine *engine = SearchEngine::create(SearchType::Content, q); + 
engine->setSearchOptions(applyTimeField(*plan.contentOptions, tf)); - QObject::connect(contentEngine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(contentEngine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(contentEngine, &SearchEngine::errorOccurred, q, onError); + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); - pendingFinishCount.fetch_add(1); - contentEngine->search(*plan.contentQuery); - } + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(*plan.contentQuery); + } - // OCR search - if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { - ocrEngine = SearchEngine::create(SearchType::Ocr, q); - ocrEngine->setSearchOptions(*plan.ocrOptions); + // OCR search + if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { + SearchEngine *engine = SearchEngine::create(SearchType::Ocr, q); + engine->setSearchOptions(applyTimeField(*plan.ocrOptions, tf)); - QObject::connect(ocrEngine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(ocrEngine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(ocrEngine, &SearchEngine::errorOccurred, q, onError); + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); - pendingFinishCount.fetch_add(1); - ocrEngine->search(*plan.ocrQuery); + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(*plan.ocrQuery); + } } // If no engines were launched (e.g., no indexes available) @@ -175,14 +193,8 @@ void SemanticSearcherData::doCancel() cancelled.store(true); timeoutTimer->stop(); - if (fileNameEngine) { - fileNameEngine->cancel(); - } - if (contentEngine) { - 
contentEngine->cancel(); - } - if (ocrEngine) { - ocrEngine->cancel(); + for (SearchEngine *e : engines) { + e->cancel(); } } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h index 75fd4197..b0a1befd 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h @@ -39,10 +39,8 @@ class SemanticSearcherData IntentParser *intentParser = nullptr; SemanticQueryBuilder *queryBuilder = nullptr; - // Sub-engines (owned per search) - SearchEngine *fileNameEngine = nullptr; - SearchEngine *contentEngine = nullptr; - SearchEngine *ocrEngine = nullptr; + // Sub-engines (owned per search, parented to q for auto-cleanup) + QList engines; std::atomic pendingFinishCount{0}; // Result collection From 84f5aef2648e876bdd97aa74ac84e08ee65bd3a4 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 17:50:35 +0800 Subject: [PATCH 07/36] fix: automatically handle hidden path search conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The changes implement automatic handling of hidden file search when user provides a path that contains hidden directories. Previously, users needed to explicitly use --include-hidden flag even when searching within hidden directories (like ~/.local/share/Trash). The modification: 1. Moves the includeHidden flag processing earlier in the parsing sequence 2. Adds auto-enable logic when detecting hidden path components 3. Preserves explicit user settings when --include-hidden is used Log: Search now automatically includes hidden files when searching within hidden directories Influence: 1. Test searches in hidden directories without --include-hidden flag 2. Verify searches in normal directories still respect includeHidden setting 3. Test explicit --include-hidden flag overrides the auto-detection 4. 
Verify mixed path cases (both hidden and non-hidden components) 5. Check backward compatibility with existing config files fix: 自动处理隐藏路径搜索条件 改动实现了当用户提供包含隐藏目录的路径时自动处理隐藏文件搜索。之前即使是 搜索隐藏目录(如~/.local/share/Trash)也要用户显式使用--include-hidden标 志。本次修改: 1. 将includeHidden标志处理移到解析顺序的前面 2. 增加检测到隐藏路径时自动启用的逻辑 3. 当使用--include-hidden时保留用户的显式设置 Log: 现在搜索隐藏目录时会自动包含隐藏文件 Influence: 1. 测试不使用--include-hidden标志搜索隐藏目录的情况 2. 验证在普通目录中搜索仍会遵守includeHidden设置 3. 测试显式的--include-hidden标志是否覆盖自动检测 4. 验证混合路径情况(包含隐藏和非隐藏部分) 5. 检查与现有配置文件的向后兼容性 --- src/dfm-search/dfm-search-client/cli_options.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index 38d740c7..3c19b388 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -208,6 +208,15 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) return false; } + // Auto-enable includeHidden when search path contains hidden directory components. + // User who explicitly specifies a hidden path (e.g. ~/.local/share/Trash) + // expects results without needing --include-hidden. 
+ config.includeHidden = m_parser.isSet(m_includeHiddenOption); + if (!config.includeHidden && !config.searchPath.isEmpty() + && Global::isHiddenPathOrInHiddenDir(config.searchPath)) { + config.includeHidden = true; + } + // In semantic mode, skip type/method/query parsing if (config.semanticMode) { config.jsonOutput = m_parser.isSet(m_jsonOption); @@ -255,7 +264,6 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) // 解析开关选项 config.caseSensitive = m_parser.isSet(m_caseSensitiveOption); - config.includeHidden = m_parser.isSet(m_includeHiddenOption); config.pinyinEnabled = m_parser.isSet(m_pinyinOption); config.pinyinAcronymEnabled = m_parser.isSet(m_pinyinAcronymOption); config.jsonOutput = m_parser.isSet(m_jsonOption); From 79a886da044a70f13b9236e61c9b89c72cf7a33b Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 20:09:59 +0800 Subject: [PATCH 08/36] feat: add location-based search support for Chinese NLP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Implemented LocationExtractor to parse and resolve standard directory names in Chinese 2. Updated ParsedIntent to include searchDirectories and includeHidden fields 3. Added location_rules.json with mappings for desktop, downloads, documents etc. 4. Included trash/recycle bin location handling with includeHidden flag 5. Modified search builder and searcher to support multiple directory searches 6. Added comprehensive test cases for all location scenarios Log: Added support for searching in specific directories like Desktop and Download with natural language queries in Chinese Influence: 1. Test natural language searches with location terms (e.g., "桌面上的 文档") 2. Verify correct path resolution for all standard directories 3. Test combined searches with locations and other filters 4. Verify trash/recycle bin searches include hidden files 5. Test multiple directory searches (e.g., "桌面和下载的图片") 6. 
Ensure backward compatibility with non-location queries feat: 为中文自然语言搜索添加基于位置的支持 1. 实现LocationExtractor解析和处理中文标准目录名称 2. 更新ParsedIntent结构体增加searchDirectories和includeHidden字段 3. 添加location_rules.json规则文件,包含桌面、下载、文档等目录映射 4. 支持回收站/垃圾箱的特殊处理(包含隐藏文件) 5. 修改搜索构建器和搜索器以支持多目录搜索 6. 添加全面的测试用例覆盖各种位置场景 Log: 新增支持中文自然语言查询在特定目录(如桌面、下载)的搜索功能 Influence: 1. 测试包含位置术语的自然语言搜索(如"桌面上的文档") 2. 验证所有标准目录的路径解析是否正确 3. 测试位置与其他条件组合的搜索 4. 验证回收站搜索是否包含隐藏文件 5. 测试多目录搜索功能(如"桌面和下载的图片") 6. 确保与非位置查询的向后兼容性 --- .../dfm-search-tests/tst_chinese_nlp.cpp | 144 +++++++++++++++++- .../dfm-search/dfm-search/semantic_types.h | 2 + .../semantic/extractors/locationextractor.cpp | 91 +++++++++++ .../semantic/extractors/locationextractor.h | 37 +++++ .../dfm-search-lib/semantic/intentparser.cpp | 2 + .../semantic/ruleconfigloader.cpp | 3 +- .../semantic/rules/zh_CN/location_rules.json | 100 ++++++++++++ .../semantic/rules/zh_CN/noise_rules.json | 8 + .../semantic/semanticquerybuilder.cpp | 4 + .../semantic/semanticquerybuilder.h | 2 + .../semantic/semanticsearcher.cpp | 94 +++++++----- 11 files changed, 448 insertions(+), 39 deletions(-) create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp create mode 100644 src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h create mode 100644 src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp index a1c5fdbf..d1e3a3d2 100644 --- a/autotests/dfm-search-tests/tst_chinese_nlp.cpp +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "semantic/intentparser.h" @@ -139,6 +140,18 @@ private Q_SLOTS: void noise_polite_words(); void noise_suffix_words(); + // Location tests + void location_desktop(); + void location_download(); + void location_documentsDir(); + void location_picturesDir(); + void location_musicDir(); + void location_videosDir(); 
+ void location_trash(); + void location_deleted(); + void location_noLocation(); + void location_desktopAndDownload(); + // End-to-end combined tests void combined_timeAndFiletype(); void combined_timeAndFiletype_multi(); @@ -164,13 +177,14 @@ void tst_ChineseNLP::initTestCase() m_engine = new SemanticRuleEngine(this); - // Load all 6 rule files + // Load all 7 rule files const QString dir = rulesDir(); QVERIFY2(QDir(dir).exists(), qPrintable(QStringLiteral("Rules dir not found: ") + dir)); const QStringList files = { "noise_rules.json", "time_rules.json", "filetype_rules.json", "keyword_rules.json", - "size_rules.json", "action_rules.json" }; + "size_rules.json", "action_rules.json", + "location_rules.json" }; for (const QString &f : files) { const QString path = dir + QLatin1Char('/') + f; bool ok = m_engine->loadRuleFile(path); @@ -184,19 +198,21 @@ void tst_ChineseNLP::initTestCase() QVERIFY(m_engine->hasGroup("noise")); QVERIFY(m_engine->hasGroup("size")); QVERIFY(m_engine->hasGroup("action")); + QVERIFY(m_engine->hasGroup("location")); const QStringList groups = m_engine->groupNames(); - QCOMPARE(groups.size(), 6); + QCOMPARE(groups.size(), 7); m_parser = new IntentParser(m_engine); // Verify default extractors are initialized QStringList names = m_parser->extractorNames(); - QCOMPARE(names.size(), 5); + QCOMPARE(names.size(), 6); QVERIFY(names.contains("time")); QVERIFY(names.contains("filetype")); QVERIFY(names.contains("size")); QVERIFY(names.contains("action")); + QVERIFY(names.contains("location")); QVERIFY(names.contains("keyword")); } @@ -1349,6 +1365,126 @@ void tst_ChineseNLP::action_combined_withTime_modify() QVERIFY(intent.fileExtensions.contains("jpg")); } +// ===== Location Tests ===== + +void tst_ChineseNLP::location_desktop() +{ + // "桌面上的文档" → location(桌面) + filetype(文档) + const QString desktopPath = QStandardPaths::writableLocation(QStandardPaths::DesktopLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("桌面上的文档"), intent); + 
QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), desktopPath); + QVERIFY(!intent.fileExtensions.isEmpty()); // document type extensions + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::location_download() +{ + // "下载里的图片" → location(下载) + filetype(图片) + const QString downloadPath = QStandardPaths::writableLocation(QStandardPaths::DownloadLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("下载里的图片"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), downloadPath); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::location_documentsDir() +{ + // "文档目录里的报告" → location(文档目录) + keyword(报告) + const QString docsPath = QStandardPaths::writableLocation(QStandardPaths::DocumentsLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("文档目录里的报告"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), docsPath); +} + +void tst_ChineseNLP::location_picturesDir() +{ + // "图片文件夹里的照片" → location(图片文件夹) + filetype(图片) + const QString picsPath = QStandardPaths::writableLocation(QStandardPaths::PicturesLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("图片文件夹里的照片"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), picsPath); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + +void tst_ChineseNLP::location_musicDir() +{ + // "音乐目录里的歌曲" → location(音乐目录) + const QString musicPath = QStandardPaths::writableLocation(QStandardPaths::MusicLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("音乐目录里的歌曲"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), musicPath); +} + +void tst_ChineseNLP::location_videosDir() +{ + // "视频目录下的电影" → location(视频目录) + const QString videosPath = 
QStandardPaths::writableLocation(QStandardPaths::MoviesLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("视频目录下的电影"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), videosPath); +} + +void tst_ChineseNLP::location_trash() +{ + // "回收站里的文件" → location(trash) + includeHidden + filetype(文件=文档类) + const QString trashPath = QDir::homePath() + "/.local/share/Trash/files"; + ParsedIntent intent; + m_parser->parse(QStringLiteral("回收站里的文件"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), trashPath); + QVERIFY(intent.includeHidden); + // "文件" matches filetype_document_general, so it's a filetype not a keyword + QVERIFY(!intent.fileExtensions.isEmpty()); + QVERIFY(intent.fileExtensions.contains("doc")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::location_deleted() +{ + // "昨天删除的音乐" → location(trash) + time(yesterday) + filetype(音乐) + const QString trashPath = QDir::homePath() + "/.local/share/Trash/files"; + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天删除的音乐"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), trashPath); + QVERIFY(intent.includeHidden); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QVERIFY(intent.fileExtensions.contains("mp3")); +} + +void tst_ChineseNLP::location_noLocation() +{ + // "今天的文档" → no location, default behavior unchanged + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的文档"), intent); + QVERIFY(intent.searchDirectories.isEmpty()); + QVERIFY(!intent.includeHidden); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); +} + +void tst_ChineseNLP::location_desktopAndDownload() +{ + // "桌面和下载的图片" → location(桌面,下载) + filetype(图片) + const QString desktopPath = 
QStandardPaths::writableLocation(QStandardPaths::DesktopLocation); + const QString downloadPath = QStandardPaths::writableLocation(QStandardPaths::DownloadLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("桌面和下载的图片"), intent); + QCOMPARE(intent.searchDirectories.size(), 2); + QVERIFY(intent.searchDirectories.contains(desktopPath)); + QVERIFY(intent.searchDirectories.contains(downloadPath)); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + QObject *create_tst_ChineseNLP() { return new tst_ChineseNLP(); diff --git a/include/dfm-search/dfm-search/semantic_types.h b/include/dfm-search/dfm-search/semantic_types.h index 7a591e3e..f68a5f70 100644 --- a/include/dfm-search/dfm-search/semantic_types.h +++ b/include/dfm-search/dfm-search/semantic_types.h @@ -92,6 +92,8 @@ struct ParsedIntent TimeConstraint timeConstraint; SizeConstraint sizeConstraint; QStringList fileExtensions; + QStringList searchDirectories; // Absolute paths resolved from location words + bool includeHidden = false; // true for trash (hidden directory) QStringList keywords; QList consumedSpans; }; diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp new file mode 100644 index 00000000..854d8ef6 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp @@ -0,0 +1,91 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "locationextractor.h" + +#include "../semanticruleengine.h" + +#include +#include + +DFM_SEARCH_BEGIN_NS + +LocationExtractor::LocationExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +LocationExtractor::~LocationExtractor() = default; + +void LocationExtractor::extract(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("location")) { + return; + } + + // Use matchAll to support multiple directory mentions (e.g., "桌面和下载的图片") + QStringList ruleIds; + const QList matches = m_engine->matchAll("location", input, &ruleIds); + + for (int i = 0; i < matches.size(); ++i) { + const QRegularExpressionMatch &m = matches[i]; + const QVariantMap metadata = m_engine->ruleMetadata("location", ruleIds[i]); + + const QString xdgType = metadata.value("xdg_type").toString(); + const bool includeHidden = metadata.value("include_hidden", false).toBool(); + + const QString path = resolveXdgPath(xdgType); + if (path.isEmpty()) { + continue; + } + + if (!intent.searchDirectories.contains(path)) { + intent.searchDirectories.append(path); + } + + if (includeHidden) { + intent.includeHidden = true; + } + + MatchSpan span; + span.start = m.capturedStart(); + span.end = m.capturedEnd(); + span.ruleId = ruleIds[i]; + intent.consumedSpans.append(span); + } +} + +QString LocationExtractor::resolveXdgPath(const QString &xdgType) +{ + if (xdgType == QLatin1String("desktop")) { + return QStandardPaths::writableLocation(QStandardPaths::DesktopLocation); + } + if (xdgType == QLatin1String("download")) { + return QStandardPaths::writableLocation(QStandardPaths::DownloadLocation); + } + if (xdgType == QLatin1String("documents")) { + return QStandardPaths::writableLocation(QStandardPaths::DocumentsLocation); + } + if (xdgType == QLatin1String("pictures")) { + return QStandardPaths::writableLocation(QStandardPaths::PicturesLocation); + } + if (xdgType == QLatin1String("music")) { + return 
QStandardPaths::writableLocation(QStandardPaths::MusicLocation); + } + if (xdgType == QLatin1String("movies")) { + return QStandardPaths::writableLocation(QStandardPaths::MoviesLocation); + } + if (xdgType == QLatin1String("trash")) { + return QDir::homePath() + QLatin1String("/.local/share/Trash/files"); + } + + return {}; +} + +QString LocationExtractor::name() const +{ + return QStringLiteral("location"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h new file mode 100644 index 00000000..cd1104c3 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef LOCATIONEXTRACTOR_H +#define LOCATIONEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class LocationExtractor : public DimensionExtractor +{ +public: + explicit LocationExtractor(SemanticRuleEngine *engine); + ~LocationExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + + /** + * @brief Resolve an XDG type string to an absolute filesystem path. 
+ * @param xdgType One of: "desktop", "download", "documents", "pictures", + * "music", "movies", "trash" + * @return The resolved absolute path, or empty string if unknown + */ + static QString resolveXdgPath(const QString &xdgType); + +private: + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // LOCATIONEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp index 97abfae8..fdcd9bc3 100644 --- a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp @@ -8,6 +8,7 @@ #include "extractors/actionextractor.h" #include "extractors/filetypeextractor.h" #include "extractors/keywordextractor.h" +#include "extractors/locationextractor.h" #include "extractors/sizeextractor.h" #include "extractors/timeextractor.h" @@ -50,6 +51,7 @@ void IntentParser::initDefaultExtractors() addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); addExtractor(std::make_unique(m_engine)); } diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp index 46a4e656..39d78355 100644 --- a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp @@ -63,7 +63,8 @@ QStringList RuleConfigLoader::ruleFileNames() return {"noise_rules.json", "time_rules.json", "filetype_rules.json", - "keyword_rules.json"}; + "keyword_rules.json", + "location_rules.json"}; } QString RuleConfigLoader::resolveLocaleDir(const QString &baseDir) diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json new file mode 100644 index 00000000..8ea723ac --- /dev/null +++ 
b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json @@ -0,0 +1,100 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "location", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "loc_desktop", + "pattern": "桌面", + "description": "Desktop directory", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "desktop", + "include_hidden": false + } + }, + { + "id": "loc_download", + "pattern": "下载", + "description": "Downloads directory", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "download", + "include_hidden": false + } + }, + { + "id": "loc_documents_dir", + "pattern": "文档目录|文档文件夹", + "description": "Documents directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "documents", + "include_hidden": false + } + }, + { + "id": "loc_pictures_dir", + "pattern": "图片目录|图片文件夹|照片目录|照片文件夹", + "description": "Pictures directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "pictures", + "include_hidden": false + } + }, + { + "id": "loc_music_dir", + "pattern": "音乐目录|音乐文件夹", + "description": "Music directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "music", + "include_hidden": false + } + }, + { + "id": "loc_videos_dir", + "pattern": "视频目录|视频文件夹|电影目录|电影文件夹", + "description": "Videos directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "movies", + "include_hidden": false + } + }, + { + "id": "loc_trash", + "pattern": "回收站|垃圾箱", + "description": "Trash directory", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "trash", + "include_hidden": true + } + }, + { + "id": "loc_deleted", + "pattern": "删除的|删除掉的", + "description": "Deleted files (maps to trash)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "trash", + 
"include_hidden": true + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json index d247f052..ce0a2d70 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json @@ -37,6 +37,14 @@ "enabled": true, "priority": 70, "metadata": {} + }, + { + "id": "noise_location_connector", + "pattern": "上的|里的|下的", + "description": "Location connector words to consume", + "enabled": true, + "priority": 190, + "metadata": {} } ] } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index 0262c470..fca6c078 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -19,6 +19,10 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) { SemanticSearchPlan plan; + // Pass location info through to plan (searcher handles per-directory options) + plan.searchDirectories = intent.searchDirectories; + plan.includeHidden = intent.includeHidden; + // Determine time field strategy if (intent.timeConstraint.isValid() && intent.timeConstraint.timeField == TimeField::Unspecified) { // Time constraint exists but no action specified → search both birth and modify time diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h index 276a694c..824f252b 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h @@ -25,6 +25,8 @@ struct SemanticSearchPlan { std::optional ocrQuery; std::optional ocrOptions; TimeField timeField = TimeField::ModifyTime; // BirthTime, ModifyTime, or Both + QStringList searchDirectories; // Empty = use 
default homePath + bool includeHidden = false; // For trash directory }; /** diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp index 206fa8c6..d96bb08b 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -71,6 +72,11 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) timeFields = {plan.timeField}; } + // Step 3b: Determine search directories + QStringList dirs = plan.searchDirectories.isEmpty() + ? QStringList{QDir::homePath()} + : plan.searchDirectories; + // Helper: apply time field to options (clone + setTimeField if time filter present) auto applyTimeField = [](const SearchOptions &opts, TimeField tf) -> SearchOptions { TimeRangeFilter tfCopy = opts.timeRangeFilter(); @@ -129,48 +135,68 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) } engines.clear(); - // Launch engines for each time field (may be 1 or 2 for Both) - for (TimeField tf : timeFields) { - // File name search (always) - if (Global::isFileNameIndexReadyForSearch()) { - SearchEngine *engine = SearchEngine::create(SearchType::FileName, q); - engine->setSearchOptions(applyTimeField(plan.fileNameOptions, tf)); + // Launch engines for each directory and each time field + for (const QString &dir : dirs) { + for (TimeField tf : timeFields) { + // File name search (always) + if (Global::isFileNameIndexReadyForSearch()) { + SearchOptions fnameOpts = applyTimeField(plan.fileNameOptions, tf); + fnameOpts.setSearchPath(dir); + if (plan.includeHidden) { + fnameOpts.setIncludeHidden(true); + } + + SearchEngine *engine = SearchEngine::create(SearchType::FileName, q); + engine->setSearchOptions(fnameOpts); + + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(engine, 
&SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(plan.fileNameQuery); + } - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + // Content search + if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { + SearchOptions contentOpts = applyTimeField(*plan.contentOptions, tf); + contentOpts.setSearchPath(dir); + if (plan.includeHidden) { + contentOpts.setIncludeHidden(true); + } - engines.append(engine); - pendingFinishCount.fetch_add(1); - engine->search(plan.fileNameQuery); - } + SearchEngine *engine = SearchEngine::create(SearchType::Content, q); + engine->setSearchOptions(contentOpts); - // Content search - if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { - SearchEngine *engine = SearchEngine::create(SearchType::Content, q); - engine->setSearchOptions(applyTimeField(*plan.contentOptions, tf)); + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(*plan.contentQuery); + } - engines.append(engine); - pendingFinishCount.fetch_add(1); - engine->search(*plan.contentQuery); - } + // OCR search + if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { + SearchOptions ocrOpts = applyTimeField(*plan.ocrOptions, tf); + ocrOpts.setSearchPath(dir); + if 
(plan.includeHidden) { + ocrOpts.setIncludeHidden(true); + } - // OCR search - if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { - SearchEngine *engine = SearchEngine::create(SearchType::Ocr, q); - engine->setSearchOptions(applyTimeField(*plan.ocrOptions, tf)); + SearchEngine *engine = SearchEngine::create(SearchType::Ocr, q); + engine->setSearchOptions(ocrOpts); - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); + QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); - engines.append(engine); - pendingFinishCount.fetch_add(1); - engine->search(*plan.ocrQuery); + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(*plan.ocrQuery); + } } } From 9c6f7b84507155bc7de330052a715e9a927e4ff0 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 14 May 2026 21:34:11 +0800 Subject: [PATCH 09/36] feat: add semantic query detection and multi-path search support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added isSemanticQuery() API to check for semantic intent in search queries 2. Implemented multiple path search support via setSearchPaths() interface 3. Added test cases for isSemanticQuery functionality 4. Optimized path prefix queries for multi-path scenarios 5. Improved handling of TimeField::Both in time range filters 6. Restructured query building to avoid duplicate engines creation Log: Improved time range filter handling for both creation and modification times Influence: 1. Test semantic query detection with various inputs (time, size, file types, locations) 2. Verify multi-path search functionality with different path combinations 3. 
Test path prefix optimization with single and multiple paths 4. Verify time range filtering with TimeField::Both option 5. Test edge cases in isSemanticQuery (empty, whitespace, pure keywords) feat: 添加语义查询检测和多路径搜索支持 1. 添加isSemanticQuery()接口用于检测搜索查询中的语义意图 2. 通过setSearchPaths()接口实现多路径搜索支持 3. 新增isSemanticQuery功能测试用例 4. 针对多路径场景优化路径前缀查询 5. 改进时间范围过滤器中TimeField::Both的处理 6. 重构查询构建逻辑,避免创建重复的搜索引擎 Log: 改进创建时间和修改时间的范围过滤处理 Influence: 1. 测试语义查询检测功能,使用不同输入(时间、大小、文件类型、路径) 2. 验证多路径搜索功能的正确性,测试不同路径组合 3. 测试单路径和多路径下的路径前缀优化 4. 验证TimeField::Both选项下的时间范围过滤效果 5. 测试isSemanticQuery的边界情况(空输入、纯空格、纯关键词等) --- autotests/dfm-search-tests/main.cpp | 5 + .../dfm-search-tests/tst_semantic_search.cpp | 252 ++++++++++++++++++ include/dfm-search/dfm-search/searchoptions.h | 27 ++ .../dfm-search/dfm-search/semanticsearcher.h | 14 + .../dfm-search/dfm-search/sizerangefilter.h | 2 - .../contentstrategies/indexedstrategy.cpp | 51 ++-- .../dfm-search-lib/core/searchoptions.cpp | 23 ++ .../dfm-search-lib/core/searchoptionsdata.h | 1 + .../filenamestrategies/indexedstrategy.cpp | 47 ++-- .../ocrtextstrategies/indexedstrategy.cpp | 51 ++-- .../semantic/extractors/actionextractor.cpp | 2 +- .../semantic/extractors/locationextractor.cpp | 2 +- .../semantic/extractors/sizeextractor.cpp | 2 +- .../semantic/extractors/timeextractor.cpp | 24 +- .../semantic/semanticquerybuilder.cpp | 7 +- .../semantic/semanticsearcher.cpp | 175 ++++++------ .../semantic/semanticsearcher_p.h | 32 ++- .../dfm-search-lib/utils/lucenequeryutils.cpp | 26 ++ .../dfm-search-lib/utils/lucenequeryutils.h | 12 + .../dfm-search-lib/utils/timerangeutils.cpp | 46 ++++ .../dfm-search-lib/utils/timerangeutils.h | 21 ++ 21 files changed, 635 insertions(+), 187 deletions(-) diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index 31282df7..88092b03 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -16,6 +16,7 @@ extern QObject *create_tst_KeywordExtraction(); extern 
QObject *create_tst_ParsedIntent(); extern QObject *create_tst_ChineseNLP(); extern QObject *create_tst_SizeRangeFilter(); +extern QObject *create_tst_IsSemanticQuery(); int main(int argc, char *argv[]) { @@ -66,5 +67,9 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj11, argc, argv); delete testObj11; + QObject *testObj12 = create_tst_IsSemanticQuery(); + result |= QTest::qExec(testObj12, argc, argv); + delete testObj12; + return result; } diff --git a/autotests/dfm-search-tests/tst_semantic_search.cpp b/autotests/dfm-search-tests/tst_semantic_search.cpp index 0aa6ae3c..56de2e3a 100644 --- a/autotests/dfm-search-tests/tst_semantic_search.cpp +++ b/autotests/dfm-search-tests/tst_semantic_search.cpp @@ -7,9 +7,13 @@ #include #include #include +#include + +#include #include "semantic/semanticruleengine.h" #include "semantic/intentparser.h" +#include "semantic/ruleconfigloader.h" using namespace DFMSEARCH; @@ -653,6 +657,253 @@ void tst_ParsedIntent::matchSpanValidity() QVERIFY(span.isValid()); } +// ===== tst_IsSemanticQuery ===== + +namespace { + +// Resolve the source tree rule directory relative to TEST_SOURCE_DIR. +// Falls back to a heuristic path if TEST_SOURCE_DIR is not defined. +QString sourceRulesDir() +{ + QString base = QString::fromUtf8(TEST_SOURCE_DIR); + if (base.isEmpty()) { + base = QCoreApplication::applicationDirPath() + "/../../.."; + } + return base + "/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN"; +} + +// Check whether the source tree rule files exist and are loadable. +bool sourceRulesAvailable() +{ + const QString dir = sourceRulesDir(); + for (const QString &filename : RuleConfigLoader::ruleFileNames()) { + if (!QFile::exists(dir + "/" + filename)) { + return false; + } + } + return true; +} + +// Replicate isSemanticQuery() logic using internal components with source-tree rules. 
+bool checkIsSemanticQuery(SemanticRuleEngine *engine, IntentParser *parser, + const QString &input) +{ + if (input.trimmed().isEmpty()) { + return false; + } + + ParsedIntent intent; + parser->parse(input, intent); + + return intent.timeConstraint.isValid() + || intent.sizeConstraint.isValid() + || !intent.fileExtensions.isEmpty() + || !intent.searchDirectories.isEmpty(); +} + +} // namespace + +class tst_IsSemanticQuery : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void initTestCase(); + void emptyInput(); + void whitespaceOnly(); + void plainKeyword(); + void plainChineseKeyword(); + void todayKeyword(); + void yesterdayKeyword(); + void thisWeekKeyword(); + void lastMonthKeyword(); + void fileTypePdf(); + void fileTypeImage(); + void fileTypeDocument(); + void locationDesktop(); + void locationDownloads(); + void locationTrash(); + void sizeLarge(); + void sizeSmall(); + void sizeDynamic(); + void timeAndFileType(); + void locationAndTime(); + void keywordOnlyNoMatch(); + void consecutiveCalls(); + void noiseWordsOnly(); + +private: + SemanticRuleEngine *m_engine = nullptr; + IntentParser *m_parser = nullptr; +}; + +void tst_IsSemanticQuery::initTestCase() +{ + if (!sourceRulesAvailable()) { + QSKIP("Rule files not found in source tree, skipping isSemanticQuery tests"); + } + + m_engine = new SemanticRuleEngine(this); + const QString dir = sourceRulesDir(); + QStringList ruleFiles = RuleConfigLoader::ruleFileNames(); + // size_rules.json is not yet registered in RuleConfigLoader::ruleFileNames(), + // but the IntentParser includes a SizeExtractor. Load it explicitly. 
+ if (!ruleFiles.contains("size_rules.json")) { + ruleFiles.append("size_rules.json"); + } + for (const QString &filename : std::as_const(ruleFiles)) { + QString path = dir + "/" + filename; + if (!m_engine->loadRuleFile(path)) { + qWarning() << "Failed to load rule file:" << path; + } + } + + m_parser = new IntentParser(m_engine); +} + +void tst_IsSemanticQuery::emptyInput() +{ + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, QString())); +} + +void tst_IsSemanticQuery::whitespaceOnly() +{ + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, " ")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "\t\n")); +} + +void tst_IsSemanticQuery::plainKeyword() +{ + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "hello")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "meeting notes")); +} + +void tst_IsSemanticQuery::plainChineseKeyword() +{ + // Pure Chinese text without any semantic triggers. + // Avoid words that match filetype/location/time/size rules + // (e.g. "报告" matches filetype_document_general, "音乐" matches filetype_audio). 
+ QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "蓝天白云")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "春夏秋冬")); +} + +void tst_IsSemanticQuery::todayKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "今天的文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "今日份报告")); +} + +void tst_IsSemanticQuery::yesterdayKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "昨天的报告")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "昨晚的截图")); +} + +void tst_IsSemanticQuery::thisWeekKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "本周的文档")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "这周修改的")); +} + +void tst_IsSemanticQuery::lastMonthKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "上个月的文件")); +} + +void tst_IsSemanticQuery::fileTypePdf() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "pdf文档")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "找一下pdf")); +} + +void tst_IsSemanticQuery::fileTypeImage() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "图片")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "截图")); +} + +void tst_IsSemanticQuery::fileTypeDocument() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "文档")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "报告")); +} + +void tst_IsSemanticQuery::locationDesktop() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "桌面的文件")); +} + +void tst_IsSemanticQuery::locationDownloads() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "下载的文件")); +} + +void tst_IsSemanticQuery::locationTrash() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "回收站的文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "删除的文件")); +} + +void tst_IsSemanticQuery::sizeLarge() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "大文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "几个G的文件")); +} + +void tst_IsSemanticQuery::sizeSmall() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "小文件")); +} + +void 
tst_IsSemanticQuery::sizeDynamic() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "大于500M的文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "小于100K")); +} + +void tst_IsSemanticQuery::timeAndFileType() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "今天的pdf")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "本周的图片")); +} + +void tst_IsSemanticQuery::locationAndTime() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "桌面今天的文件")); +} + +void tst_IsSemanticQuery::keywordOnlyNoMatch() +{ + // Text that does not match any semantic rule pattern + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "xyzabc123")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "随便什么文字")); +} + +void tst_IsSemanticQuery::consecutiveCalls() +{ + // Multiple calls with the same input should return consistent results + QString input = "今天的pdf"; + bool first = checkIsSemanticQuery(m_engine, m_parser, input); + bool second = checkIsSemanticQuery(m_engine, m_parser, input); + bool third = checkIsSemanticQuery(m_engine, m_parser, input); + QCOMPARE(first, second); + QCOMPARE(second, third); + QVERIFY(first); + + QString plain = "hello world"; + bool p1 = checkIsSemanticQuery(m_engine, m_parser, plain); + bool p2 = checkIsSemanticQuery(m_engine, m_parser, plain); + QCOMPARE(p1, p2); + QVERIFY(!p1); +} + +void tst_IsSemanticQuery::noiseWordsOnly() +{ + // Noise words alone (search action words) without any semantic dimension + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "搜索")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "查找")); +} + // ===== Factory functions ===== QObject *create_tst_RuleEngine() { return new tst_RuleEngine(); } @@ -660,5 +911,6 @@ QObject *create_tst_TimeExtraction() { return new tst_TimeExtraction(); } QObject *create_tst_FileTypeExtraction() { return new tst_FileTypeExtraction(); } QObject *create_tst_KeywordExtraction() { return new tst_KeywordExtraction(); } QObject *create_tst_ParsedIntent() { return new tst_ParsedIntent(); } 
+QObject *create_tst_IsSemanticQuery() { return new tst_IsSemanticQuery(); } #include "tst_semantic_search.moc" diff --git a/include/dfm-search/dfm-search/searchoptions.h b/include/dfm-search/dfm-search/searchoptions.h index b13f1819..f9ae1dce 100644 --- a/include/dfm-search/dfm-search/searchoptions.h +++ b/include/dfm-search/dfm-search/searchoptions.h @@ -75,14 +75,41 @@ class SearchOptions /** * @brief Get the starting search path + * + * Returns the first path from searchPaths(), or an empty string if no paths are set. + * @return The primary search path */ QString searchPath() const; /** * @brief Set the starting search path + * + * This replaces all search paths with a single path. + * @param path The search path */ void setSearchPath(const QString &path); + /** + * @brief Get all search paths + * + * When multiple paths are set, the search engine will search all of them + * and combine results. Returns a list containing the single searchPath + * if only one path was set via setSearchPath(). + * + * @return List of search paths + */ + QStringList searchPaths() const; + + /** + * @brief Set multiple search paths + * + * When multiple paths are set, search engines that support multi-path + * queries will build combined path prefix queries internally. + * + * @param paths List of search paths + */ + void setSearchPaths(const QStringList &paths); + /** * @brief Returns the current list of excluded search paths. * diff --git a/include/dfm-search/dfm-search/semanticsearcher.h b/include/dfm-search/dfm-search/semanticsearcher.h index 03775085..8c4c85c9 100644 --- a/include/dfm-search/dfm-search/semanticsearcher.h +++ b/include/dfm-search/dfm-search/semanticsearcher.h @@ -71,6 +71,20 @@ class SemanticSearcher : public QObject */ void search(const QString &naturalLanguage); + /** + * @brief Check if the input contains semantic intent beyond a plain keyword. 
+ * + * Returns true if parsing the input reveals time constraints, size constraints, + * file type filters, or location constraints. Returns false for plain keyword input. + * + * This allows callers to avoid unnecessary semantic search overhead when + * the user is just typing a simple keyword. + * + * @param input The natural language query to check + * @return true if the input contains semantic intent, false for plain keywords + */ + bool isSemanticQuery(const QString &input) const; + /** * @brief Perform a synchronous semantic search * diff --git a/include/dfm-search/dfm-search/sizerangefilter.h b/include/dfm-search/dfm-search/sizerangefilter.h index a99cf95b..57d15f02 100644 --- a/include/dfm-search/dfm-search/sizerangefilter.h +++ b/include/dfm-search/dfm-search/sizerangefilter.h @@ -4,8 +4,6 @@ #ifndef SIZERANGEFILTER_H #define SIZERANGEFILTER_H -#include - #include DFM_SEARCH_BEGIN_NS diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index 1efcc582..12c35690 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -101,34 +101,36 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } // Add path prefix query optimization - if (mainQuery && SearchUtility::isContentIndexAncestorPathsSupported() - && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildPathPrefixQuery(searchPath, - QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); - if (pathPrefixQuery) { - BooleanQueryPtr finalQuery = newLucene(); - finalQuery->add(mainQuery, BooleanClause::MUST); - finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using path prefix query for content search optimization:" << searchPath; - mainQuery = 
finalQuery; + QStringList searchPathsList = m_options.searchPaths(); + if (mainQuery && SearchUtility::isContentIndexAncestorPathsSupported()) { + bool usePrefixQuery = false; + for (const QString &p : searchPathsList) { + if (SearchUtility::shouldUsePathPrefixQuery(p)) { + usePrefixQuery = true; + break; + } + } + if (usePrefixQuery) { + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); + if (pathPrefixQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(pathPrefixQuery, BooleanClause::MUST); + qInfo() << "Using multi-path prefix query for content search optimization:" << searchPathsList; + mainQuery = finalQuery; + } } } // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); - auto [start, end] = filter.resolveTimeRange(); - - qint64 startEpoch = TimeRangeUtils::toEpochSecs(start); - qint64 endEpoch = TimeRangeUtils::toEpochSecs(end); - - const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) - ? 
LuceneFieldNames::Content::kBirthTime - : LuceneFieldNames::Content::kModifyTime; - - QueryPtr timeQuery = TimeRangeUtils::buildNumericRangeQuery( - fieldName, startEpoch, endEpoch, - filter.includeLower(), filter.includeUpper()); + QueryPtr timeQuery = TimeRangeUtils::buildTimeRangeFilterQuery( + filter, + LuceneFieldNames::Content::kBirthTime, + LuceneFieldNames::Content::kModifyTime); if (timeQuery) { if (mainQuery) { @@ -280,6 +282,7 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QString searchPath = m_options.searchPath(); const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); + QStringList allSearchPaths = m_options.searchPaths(); auto docsSize = scoreDocs.size(); ContentOptionsAPI optAPI(m_options); @@ -337,7 +340,9 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QString path = QString::fromStdWString(pathField); - if (!path.startsWith(searchPath)) { + // Check against all search paths + if (!std::any_of(allSearchPaths.cbegin(), allSearchPaths.cend(), + [&path](const auto &sp) { return path.startsWith(sp); })) { continue; } diff --git a/src/dfm-search/dfm-search-lib/core/searchoptions.cpp b/src/dfm-search/dfm-search-lib/core/searchoptions.cpp index 2657a28f..07376898 100644 --- a/src/dfm-search/dfm-search-lib/core/searchoptions.cpp +++ b/src/dfm-search/dfm-search-lib/core/searchoptions.cpp @@ -78,12 +78,35 @@ void SearchOptions::setCaseSensitive(bool sensitive) QString SearchOptions::searchPath() const { + if (!d->searchPathsList.isEmpty()) { + return d->searchPathsList.first(); + } return d->searchPath; } void SearchOptions::setSearchPath(const QString &path) { d->searchPath = path; + d->searchPathsList.clear(); +} + +QStringList SearchOptions::searchPaths() const +{ + if (!d->searchPathsList.isEmpty()) { + return d->searchPathsList; + } + if (!d->searchPath.isEmpty()) { + return QStringList { d->searchPath }; + } + return {}; +} + +void 
SearchOptions::setSearchPaths(const QStringList &paths) +{ + d->searchPathsList = paths; + if (!paths.isEmpty()) { + d->searchPath = paths.first(); + } } QStringList SearchOptions::searchExcludedPaths() const diff --git a/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h b/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h index 2ae5e77c..0f99ed57 100644 --- a/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h +++ b/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h @@ -31,6 +31,7 @@ class SearchOptionsData SearchMethod method; ///< The search method to use bool caseSensitive; ///< Whether search is case sensitive QString searchPath; ///< The path to search in + QStringList searchPathsList; ///< Multiple search paths QStringList searchExcludedPaths; ///< excluded search paths. bool includeHidden; ///< Whether to include hidden files int maxResults; ///< Maximum number of results to return diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp index 5adcbdd2..bb8c606d 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp @@ -522,6 +522,9 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q auto docsSize = scoreDocs.size(); m_results.reserve(docsSize); + // Get all search paths for post-filtering + QStringList allSearchPaths = m_options.searchPaths(); + // 实时处理搜索结果 for (int i = 0; i < docsSize; i++) { if (m_cancelled.load()) { @@ -534,7 +537,9 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q DocumentPtr doc = searcher->doc(scoreDoc->doc); QString path = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFullPath)); - if (!path.startsWith(searchPath)) { + // Check against all search paths + if 
(!std::any_of(allSearchPaths.cbegin(), allSearchPaths.cend(), + [&path](const auto &sp) { return path.startsWith(sp); })) { continue; } @@ -748,18 +753,10 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); - auto [start, end] = filter.resolveTimeRange(); - - qint64 startEpoch = TimeRangeUtils::toEpochSecs(start); - qint64 endEpoch = TimeRangeUtils::toEpochSecs(end); - - const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) - ? LuceneFieldNames::FileName::kBirthTime - : LuceneFieldNames::FileName::kModifyTime; - - QueryPtr timeQuery = TimeRangeUtils::buildNumericRangeQuery( - fieldName, startEpoch, endEpoch, - filter.includeLower(), filter.includeUpper()); + QueryPtr timeQuery = TimeRangeUtils::buildTimeRangeFilterQuery( + filter, + LuceneFieldNames::FileName::kBirthTime, + LuceneFieldNames::FileName::kModifyTime); if (timeQuery) { finalQuery->add(timeQuery, BooleanClause::MUST); @@ -783,13 +780,23 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } // Add path prefix query optimization - if (hasValidQuery && SearchUtility::isFilenameIndexAncestorPathsSupported() - && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildPathPrefixQuery(searchPath, - QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); - if (pathPrefixQuery) { - finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using path prefix query for optimization:" << searchPath; + QStringList searchPathsList = m_options.searchPaths(); + if (hasValidQuery && SearchUtility::isFilenameIndexAncestorPathsSupported()) { + bool usePrefixQuery = false; + for (const QString &p : searchPathsList) { + if (SearchUtility::shouldUsePathPrefixQuery(p)) { + usePrefixQuery = true; + break; + } + } + if (usePrefixQuery) { + QueryPtr 
pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); + if (pathPrefixQuery) { + finalQuery->add(pathPrefixQuery, BooleanClause::MUST); + qInfo() << "Using multi-path prefix query for optimization:" << searchPathsList; + } } } diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index 2d146e94..8bde5aab 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -98,34 +98,36 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } // Add path prefix query optimization - if (mainQuery && SearchUtility::isOcrTextIndexAncestorPathsSupported() - && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildPathPrefixQuery(searchPath, - QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); - if (pathPrefixQuery) { - BooleanQueryPtr finalQuery = newLucene(); - finalQuery->add(mainQuery, BooleanClause::MUST); - finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using path prefix query for OCR text search optimization:" << searchPath; - mainQuery = finalQuery; + QStringList searchPathsList = m_options.searchPaths(); + if (mainQuery && SearchUtility::isOcrTextIndexAncestorPathsSupported()) { + bool usePrefixQuery = false; + for (const QString &p : searchPathsList) { + if (SearchUtility::shouldUsePathPrefixQuery(p)) { + usePrefixQuery = true; + break; + } + } + if (usePrefixQuery) { + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); + if (pathPrefixQuery) { + BooleanQueryPtr finalQuery = newLucene(); + 
finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(pathPrefixQuery, BooleanClause::MUST); + qInfo() << "Using multi-path prefix query for OCR text search optimization:" << searchPathsList; + mainQuery = finalQuery; + } } } // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); - auto [start, end] = filter.resolveTimeRange(); - - qint64 startEpoch = TimeRangeUtils::toEpochSecs(start); - qint64 endEpoch = TimeRangeUtils::toEpochSecs(end); - - const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) - ? LuceneFieldNames::OcrText::kBirthTime - : LuceneFieldNames::OcrText::kModifyTime; - - QueryPtr timeQuery = TimeRangeUtils::buildNumericRangeQuery( - fieldName, startEpoch, endEpoch, - filter.includeLower(), filter.includeUpper()); + QueryPtr timeQuery = TimeRangeUtils::buildTimeRangeFilterQuery( + filter, + LuceneFieldNames::OcrText::kBirthTime, + LuceneFieldNames::OcrText::kModifyTime); if (timeQuery) { if (mainQuery) { @@ -276,6 +278,7 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QString searchPath = m_options.searchPath(); const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); + QStringList allSearchPaths = m_options.searchPaths(); auto docsSize = scoreDocs.size(); OcrTextOptionsAPI optAPI(m_options); @@ -333,7 +336,9 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QString path = QString::fromStdWString(pathField); - if (!path.startsWith(searchPath)) { + // Check against all search paths + if (!std::any_of(allSearchPaths.cbegin(), allSearchPaths.cend(), + [&path](const auto &sp) { return path.startsWith(sp); })) { continue; } diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp index 79e421e3..52ed77c9 100644 --- 
a/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp @@ -4,7 +4,7 @@ #include "actionextractor.h" -#include "../semanticruleengine.h" +#include "semantic/semanticruleengine.h" #include diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp index 854d8ef6..ef129636 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp @@ -4,7 +4,7 @@ #include "locationextractor.h" -#include "../semanticruleengine.h" +#include "semantic/semanticruleengine.h" #include #include diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp index 7fe51f4c..46794f27 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp @@ -4,7 +4,7 @@ #include "sizeextractor.h" -#include "../semanticruleengine.h" +#include "semantic/semanticruleengine.h" #include diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp index 5f04389f..30501f58 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp @@ -4,7 +4,7 @@ #include "timeextractor.h" -#include "../semanticruleengine.h" +#include "semantic/semanticruleengine.h" #include #include @@ -38,15 +38,15 @@ void TimeExtractor::extract(const QString &input, ParsedIntent &intent) if (typeStr == "preset") { const QString presetStr = metadata.value("preset").toString(); static const QMap kPresetMap = { - {"today", TimePreset::Today}, - {"yesterday", TimePreset::Yesterday}, - 
{"day_before_yesterday", TimePreset::DayBeforeYesterday}, - {"this_week", TimePreset::ThisWeek}, - {"last_week", TimePreset::LastWeek}, - {"this_month", TimePreset::ThisMonth}, - {"last_month", TimePreset::LastMonth}, - {"this_year", TimePreset::ThisYear}, - {"last_year", TimePreset::LastYear}, + { "today", TimePreset::Today }, + { "yesterday", TimePreset::Yesterday }, + { "day_before_yesterday", TimePreset::DayBeforeYesterday }, + { "this_week", TimePreset::ThisWeek }, + { "last_week", TimePreset::LastWeek }, + { "this_month", TimePreset::ThisMonth }, + { "last_month", TimePreset::LastMonth }, + { "this_year", TimePreset::ThisYear }, + { "last_year", TimePreset::LastYear }, }; if (kPresetMap.contains(presetStr)) { @@ -70,8 +70,8 @@ void TimeExtractor::extract(const QString &input, ParsedIntent &intent) } void TimeExtractor::parseCustomTime(const QRegularExpressionMatch &match, - const QVariantMap &metadata, - TimeConstraint &tc) + const QVariantMap &metadata, + TimeConstraint &tc) { Q_UNUSED(metadata); diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index fca6c078..65d074cf 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -187,12 +187,13 @@ TimeRangeFilter SemanticQueryBuilder::buildTimeRangeFilter(const TimeConstraint break; } - // Set time field if explicitly specified by ActionExtractor + // Set time field on the filter if (tc.timeField == TimeField::BirthTime || tc.timeField == TimeField::ModifyTime) { filter.setTimeField(tc.timeField); + } else if (tc.timeField == TimeField::Unspecified || tc.timeField == TimeField::Both) { + // No specific time field or both requested → search both birth and modify time + filter.setTimeField(TimeField::Both); } - // When Unspecified or Both: timeField is NOT set on the filter here. 
- // The searcher handles Both by creating duplicate engines with each field. return filter; } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp index d96bb08b..e02d46d4 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -9,6 +9,7 @@ #include "semanticruleengine.h" #include +#include #include #include @@ -18,11 +19,7 @@ DFM_SEARCH_BEGIN_NS SemanticSearcherData::SemanticSearcherData(SemanticSearcher *q_ptr) - : q(q_ptr) - , ruleEngine(new SemanticRuleEngine(q)) - , intentParser(new IntentParser(ruleEngine)) - , queryBuilder(new SemanticQueryBuilder()) - , timeoutTimer(new QTimer(q)) + : q(q_ptr), ruleEngine(new SemanticRuleEngine(q)), intentParser(new IntentParser(ruleEngine)), queryBuilder(new SemanticQueryBuilder()), timeoutTimer(new QTimer(q)) { timeoutTimer->setSingleShot(true); timeoutTimer->setInterval(timeoutSeconds * 1000); @@ -50,6 +47,7 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) return; } + // Step 1: Validate + reset state cancelled.store(false); allResults.clear(); seenPaths.clear(); @@ -57,39 +55,19 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) Q_EMIT q->statusChanged(SearchStatus::Searching); Q_EMIT q->searchStarted(); - // Step 1: Parse natural language into intent + // Step 2: Parse natural language into intent ParsedIntent intent; intentParser->parse(naturalLanguage, intent); - // Step 2: Build search plan + // Step 3: Build search plan const SemanticSearchPlan plan = queryBuilder->build(intent); - // Step 3: Determine time fields to search - QList timeFields; - if (plan.timeField == TimeField::Both) { - timeFields = {TimeField::BirthTime, TimeField::ModifyTime}; - } else { - timeFields = {plan.timeField}; - } - - // Step 3b: Determine search directories + // Step 4: Determine search directories QStringList dirs = 
plan.searchDirectories.isEmpty() - ? QStringList{QDir::homePath()} - : plan.searchDirectories; - - // Helper: apply time field to options (clone + setTimeField if time filter present) - auto applyTimeField = [](const SearchOptions &opts, TimeField tf) -> SearchOptions { - TimeRangeFilter tfCopy = opts.timeRangeFilter(); - if (tfCopy.isValid()) { - tfCopy.setTimeField(tf); - SearchOptions result = opts; - result.setTimeRangeFilter(tfCopy); - return result; - } - return opts; - }; + ? QStringList { QDir::homePath() } + : plan.searchDirectories; - // Step 4: Create and launch search engines in parallel + // Step 5: Set up signal/slot handlers auto onResultsFound = [this](const SearchResultList &results) { SearchResultList newResults; for (const SearchResult &r : results) { @@ -120,100 +98,87 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) } }; - auto onError = [this](const SearchError &error) { + auto onError = [](const SearchError &error) { qWarning() << "Search error:" << error.message(); // Don't propagate individual engine errors to caller // The other engines may still produce valid results }; - // Count how many engines we'll launch + // Step 6: Clean up any previous search engines pendingFinishCount.store(0); - - // Clean up any previous search engines (they have parent q, so Qt deletes them) for (SearchEngine *e : engines) { e->deleteLater(); } engines.clear(); - // Launch engines for each directory and each time field - for (const QString &dir : dirs) { - for (TimeField tf : timeFields) { - // File name search (always) - if (Global::isFileNameIndexReadyForSearch()) { - SearchOptions fnameOpts = applyTimeField(plan.fileNameOptions, tf); - fnameOpts.setSearchPath(dir); - if (plan.includeHidden) { - fnameOpts.setIncludeHidden(true); - } - - SearchEngine *engine = SearchEngine::create(SearchType::FileName, q); - engine->setSearchOptions(fnameOpts); - - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); - 
QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); - - engines.append(engine); - pendingFinishCount.fetch_add(1); - engine->search(plan.fileNameQuery); - } - - // Content search - if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { - SearchOptions contentOpts = applyTimeField(*plan.contentOptions, tf); - contentOpts.setSearchPath(dir); - if (plan.includeHidden) { - contentOpts.setIncludeHidden(true); - } - - SearchEngine *engine = SearchEngine::create(SearchType::Content, q); - engine->setSearchOptions(contentOpts); - - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); - - engines.append(engine); - pendingFinishCount.fetch_add(1); - engine->search(*plan.contentQuery); - } + // Step 7: Helper to prepare options with multi-path and hidden settings + auto prepareOptions = [&dirs, &plan](const SearchOptions &baseOpts) -> SearchOptions { + SearchOptions opts = baseOpts; + opts.setSearchPaths(dirs); + if (plan.includeHidden) { + opts.setIncludeHidden(true); + } + return opts; + }; - // OCR search - if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { - SearchOptions ocrOpts = applyTimeField(*plan.ocrOptions, tf); - ocrOpts.setSearchPath(dir); - if (plan.includeHidden) { - ocrOpts.setIncludeHidden(true); - } + // Step 8: Launch up to 3 engines (FileName, Content, OCR) + // TimeField::Both is no longer expanded here; it is handled by the Lucene strategy layer. + // Multiple directories are passed via setSearchPaths(). 
- SearchEngine *engine = SearchEngine::create(SearchType::Ocr, q); - engine->setSearchOptions(ocrOpts); + // File name search (always, if index is ready) + if (Global::isFileNameIndexReadyForSearch()) { + SearchOptions fnameOpts = prepareOptions(plan.fileNameOptions); + createAndLaunchEngine(SearchType::FileName, plan.fileNameQuery, + fnameOpts, onResultsFound, onFinished, onError); + } - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); - QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); - QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + // Content search + if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { + SearchOptions contentOpts = prepareOptions(*plan.contentOptions); + createAndLaunchEngine(SearchType::Content, *plan.contentQuery, + contentOpts, onResultsFound, onFinished, onError); + } - engines.append(engine); - pendingFinishCount.fetch_add(1); - engine->search(*plan.ocrQuery); - } - } + // OCR search + if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { + SearchOptions ocrOpts = prepareOptions(*plan.ocrOptions); + createAndLaunchEngine(SearchType::Ocr, *plan.ocrQuery, + ocrOpts, onResultsFound, onFinished, onError); } - // If no engines were launched (e.g., no indexes available) + // Step 9: Handle no-engine case if (pendingFinishCount.load() == 0) { timeoutTimer->stop(); status.store(SearchStatus::Finished); Q_EMIT q->statusChanged(SearchStatus::Finished); Q_EMIT q->searchFinished({}); } else { - // Start timeout timer if (timeoutSeconds > 0) { timeoutTimer->start(); } } } +void SemanticSearcherData::createAndLaunchEngine( + SearchType type, + const SearchQuery &query, + const SearchOptions &options, + std::function onResultsFound, + std::function onFinished, + std::function onError) +{ + SearchEngine *engine = SearchEngine::create(type, q); + engine->setSearchOptions(options); + + QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); 
+ QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); + QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); + + engines.append(engine); + pendingFinishCount.fetch_add(1); + engine->search(query); +} + void SemanticSearcherData::doCancel() { cancelled.store(true); @@ -227,8 +192,7 @@ void SemanticSearcherData::doCancel() // --- SemanticSearcher public API --- SemanticSearcher::SemanticSearcher(QObject *parent) - : QObject(parent) - , d_ptr(new SemanticSearcherData(this)) + : QObject(parent), d_ptr(new SemanticSearcherData(this)) { } @@ -260,6 +224,21 @@ void SemanticSearcher::search(const QString &naturalLanguage) d_ptr->doSearch(naturalLanguage); } +bool SemanticSearcher::isSemanticQuery(const QString &input) const +{ + if (input.trimmed().isEmpty()) { + return false; + } + + ParsedIntent intent; + d_ptr->intentParser->parse(input, intent); + + return intent.timeConstraint.isValid() + || intent.sizeConstraint.isValid() + || !intent.fileExtensions.isEmpty() + || !intent.searchDirectories.isEmpty(); +} + void SemanticSearcher::cancel() { d_ptr->doCancel(); diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h index b0a1befd..ef1d2801 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h @@ -8,8 +8,12 @@ #include #include +#include + #include #include +#include +#include DFM_SEARCH_BEGIN_NS @@ -17,6 +21,7 @@ class SemanticRuleEngine; class IntentParser; class SemanticQueryBuilder; class SemanticSearchPlan; +class SemanticSearcher; class SemanticSearcherData { @@ -27,11 +32,32 @@ class SemanticSearcherData void doSearch(const QString &naturalLanguage); void doCancel(); + /** + * @brief Create, configure, and launch a search engine + * + * Creates a SearchEngine of the given type, sets its options, connects + * signal/slot handlers, appends it to the engines list, 
increments the + * pending finish counter, and starts the search. + * + * @param type The search engine type (FileName, Content, or Ocr) + * @param query The search query to execute + * @param options The search options (including multi-path and time filter) + * @param onResultsFound Callback for result aggregation + * @param onFinished Callback for engine completion tracking + * @param onError Callback for error handling + */ + void createAndLaunchEngine(SearchType type, + const SearchQuery &query, + const SearchOptions &options, + std::function onResultsFound, + std::function onFinished, + std::function onError); + SemanticSearcher *q = nullptr; // State - std::atomic status{SearchStatus::Ready}; - std::atomic cancelled{false}; + std::atomic status { SearchStatus::Ready }; + std::atomic cancelled { false }; int timeoutSeconds = 60; // Core components (owned) @@ -41,7 +67,7 @@ class SemanticSearcherData // Sub-engines (owned per search, parented to q for auto-cleanup) QList engines; - std::atomic pendingFinishCount{0}; + std::atomic pendingFinishCount { 0 }; // Result collection SearchResultList allResults; diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp index 0e1a93c2..603fad88 100644 --- a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp @@ -5,6 +5,8 @@ #include +#include + DFM_SEARCH_BEGIN_NS namespace LuceneQueryUtils { @@ -65,6 +67,30 @@ Lucene::QueryPtr buildPathPrefixQuery(const QString &pathPrefix, const QString & Lucene::StringUtils::toUnicode(normalizedPath.toStdString()))); } +Lucene::QueryPtr buildMultiPathPrefixQuery(const QStringList &paths, const QString &fieldName) +{ + if (paths.isEmpty() || fieldName.isEmpty()) { + return nullptr; + } + + if (paths.size() == 1) { + return buildPathPrefixQuery(paths.first(), fieldName); + } + + Lucene::BooleanQueryPtr boolQuery = Lucene::newLucene(); + bool 
hasValid = false; + + for (const QString &path : paths) { + Lucene::QueryPtr pathQuery = buildPathPrefixQuery(path, fieldName); + if (pathQuery) { + boolQuery->add(pathQuery, Lucene::BooleanClause::SHOULD); + hasValid = true; + } + } + + return hasValid ? boolQuery : nullptr; +} + } // namespace LuceneQueryUtils DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h index 1c5ead25..c7d1c834 100644 --- a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h @@ -39,6 +39,18 @@ std::wstring getLuceneSpecialChars(); */ Lucene::QueryPtr buildPathPrefixQuery(const QString &pathPrefix, const QString &fieldName); +/** + * @brief Build a multi-path prefix query for Lucene + * + * When multiple paths are provided, builds a BooleanQuery with SHOULD clauses + * for each path. When only one path is provided, returns a simple pathPrefixQuery. + * + * @param paths List of path prefixes to search for + * @param fieldName The index field name (e.g., "ancestor_paths") + * @return Lucene query object, or nullptr if paths is empty + */ +Lucene::QueryPtr buildMultiPathPrefixQuery(const QStringList &paths, const QString &fieldName); + } // namespace LuceneQueryUtils DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp b/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp index 3fddbf35..d13e5a95 100644 --- a/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp +++ b/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp @@ -5,6 +5,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -38,6 +39,51 @@ Lucene::QueryPtr buildNumericRangeQuery( includeUpper); } +Lucene::QueryPtr buildTimeRangeFilterQuery( + const TimeRangeFilter &filter, + const wchar_t *birthTimeField, + const wchar_t *modifyTimeField) +{ + if (!filter.isValid()) { + return nullptr; + } + + auto [start, end] = filter.resolveTimeRange(); 
+ qint64 startEpoch = toEpochSecs(start); + qint64 endEpoch = toEpochSecs(end); + + if (filter.timeField() == TimeField::Both) { + // Build BooleanQuery with SHOULD for both time fields + Lucene::BooleanQueryPtr timeBoolQuery = Lucene::newLucene(); + + Lucene::QueryPtr birthQuery = buildNumericRangeQuery( + birthTimeField, startEpoch, endEpoch, + filter.includeLower(), filter.includeUpper()); + if (birthQuery) { + timeBoolQuery->add(birthQuery, Lucene::BooleanClause::SHOULD); + } + + Lucene::QueryPtr modifyQuery = buildNumericRangeQuery( + modifyTimeField, startEpoch, endEpoch, + filter.includeLower(), filter.includeUpper()); + if (modifyQuery) { + timeBoolQuery->add(modifyQuery, Lucene::BooleanClause::SHOULD); + } + + // Need at least one clause for a valid BooleanQuery + return (birthQuery || modifyQuery) ? timeBoolQuery : nullptr; + } + + // Single field query + const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) + ? birthTimeField + : modifyTimeField; + + return buildNumericRangeQuery( + fieldName, startEpoch, endEpoch, + filter.includeLower(), filter.includeUpper()); +} + } // namespace TimeRangeUtils DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/timerangeutils.h b/src/dfm-search/dfm-search-lib/utils/timerangeutils.h index 72e9196c..6db71022 100644 --- a/src/dfm-search/dfm-search-lib/utils/timerangeutils.h +++ b/src/dfm-search/dfm-search-lib/utils/timerangeutils.h @@ -12,6 +12,8 @@ DFM_SEARCH_BEGIN_NS +class TimeRangeFilter; + /** * @brief TimeRangeUtils provides utility functions for time range operations */ @@ -40,6 +42,25 @@ Lucene::QueryPtr buildNumericRangeQuery( bool includeLower, bool includeUpper); +/** + * @brief Build a Lucene query for time range filtering, supporting TimeField::Both + * + * When filter.timeField() is TimeField::Both, this builds a BooleanQuery with SHOULD + * clauses for both birth_time and modify_time fields. Otherwise, it builds a single + * NumericRangeQuery for the specified field. 
+ * + * The returned query is designed to be added to an outer query with BooleanClause::MUST. + * + * @param filter The time range filter containing field selection and range + * @param birthTimeField The Lucene field name for birth time + * @param modifyTimeField The Lucene field name for modification time + * @return A Lucene query (single NumericRangeQuery or BooleanQuery for Both), or nullptr if invalid + */ +Lucene::QueryPtr buildTimeRangeFilterQuery( + const TimeRangeFilter &filter, + const wchar_t *birthTimeField, + const wchar_t *modifyTimeField); + } // namespace TimeRangeUtils DFM_SEARCH_END_NS From ecc50e11f72ceac32893367b65e8410f921d8630 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 15 May 2026 09:04:40 +0800 Subject: [PATCH 10/36] feat: add file size range filter to search strategies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Implemented file size range filtering for both content and OCR text search strategies 2. Added logic to build numeric range queries for file sizes using TimeRangeUtils 3. Integrated size filter with existing search queries using BooleanQuery 4. The filter works both in combination with other queries and as standalone query 5. Supports inclusive/exclusive bounds through SizeRangeFilter settings Log: Added file size range filtering in search results Influence: 1. Test search with various size filters (min, max, both) 2. Verify inclusive/exclusive bounds work correctly 3. Test in combination with other search criteria (filename, content, etc) 4. Check performance impact with large file collections 5. Verify edge cases (empty range, invalid values) 6. Test with extreme size values (0, very large files) feat: 在搜索策略中添加文件大小范围过滤 1. 为内容和OCR文本搜索策略实现了文件大小范围过滤 2. 添加了使用TimeRangeUtils构建文件大小数字范围查询的逻辑 3. 通过BooleanQuery将大小过滤器与现有搜索查询集成 4. 该过滤器既可与其他查询组合使用,也可作为独立查询 5. 通过SizeRangeFilter设置支持包含/排除边界 Log: 在搜索结果中添加文件大小范围过滤功能 Influence: 1. 测试不同大小范围过滤条件的搜索(最小值、最大值、两者组合) 2. 验证包含/排除边界工作正常 3. 
测试与其他搜索条件组合使用的情况(文件名、内容等) 4. 检查大文件集合的性能影响 5. 验证边界情况(空范围、无效值) 6. 测试极端大小值(0、超大文件) --- .../contentstrategies/indexedstrategy.cpp | 21 +++++++++++++++++++ .../ocrtextstrategies/indexedstrategy.cpp | 21 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index 12c35690..35dadaeb 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -145,6 +145,27 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add file size range filter query + if (m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QueryPtr sizeQuery = TimeRangeUtils::buildNumericRangeQuery( + LuceneFieldNames::Content::kFileSize, + sizeFilter.minSize(), sizeFilter.maxSize(), + sizeFilter.includeLower(), sizeFilter.includeUpper()); + + if (sizeQuery) { + if (mainQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(sizeQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } else { + // Size filter alone is a valid query + mainQuery = sizeQuery; + } + } + } + // Add filename keyword query QString filenameKw = optAPI.filenameKeyword(); if (!filenameKw.isEmpty()) { diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index 8bde5aab..d83ac4b4 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -142,6 +142,27 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add 
file size range filter query + if (m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QueryPtr sizeQuery = TimeRangeUtils::buildNumericRangeQuery( + LuceneFieldNames::OcrText::kFileSize, + sizeFilter.minSize(), sizeFilter.maxSize(), + sizeFilter.includeLower(), sizeFilter.includeUpper()); + + if (sizeQuery) { + if (mainQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(sizeQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } else { + // Size filter alone is a valid query + mainQuery = sizeQuery; + } + } + } + // Add filename keyword query QString filenameKw = optAPI.filenameKeyword(); if (!filenameKw.isEmpty()) { From 868ba70e16ff5abb7dbd40fa1512041a30aa8dc9 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 15 May 2026 09:17:30 +0800 Subject: [PATCH 11/36] fix: unify dfm-search library and path names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Changed installation paths to consistently use 'dfm-search' instead of dynamic library name 2. Removed DFM_SEARCH_LIB_NAME compile definition as it's no longer needed 3. Updated path references in comments to match new standardized naming 4. Simplified semantic rules path construction by removing library name dependency These changes standardize the naming scheme and paths across the codebase, making configuration and maintenance simpler. Previously different versions (dfm-search vs dfm6-search) had slightly different paths and configuration, which could lead to inconsistencies. Influence: 1. Verify semantic rule loading from standard /usr/share/deepin/dfm- search path 2. Check user configuration fallback in ~/.config/deepin/dfm-search 3. Test search functionality to ensure rules are properly loaded from new paths 4. Verify installation creates correct directory structure under /usr/ share/deepin/dfm-search fix: 统一 dfm-search 库名称和路径命名 1. 
将安装路径统一改为使用 'dfm-search' 而不是动态库名称 2. 移除了不再需要的 DFM_SEARCH_LIB_NAME 编译定义 3. 更新注释中的路径引用以匹配新的标准化命名 4. 通过移除库名称依赖简化了语义规则路径构建 这些更改在整个代码库中标准化了命名方案和路径,使配置和维护更简单。之前不 同版本(dfm-search/dfm6-search)有稍微不同的路径和配置,可能导致不一致。 Influence: 1. 验证从标准路径/usr/share/deepin/dfm-search加载语义规则 2. 检查用户配置回退路径~/.config/deepin/dfm-search 3. 测试搜索功能确保从新路径正确加载规则 4. 验证安装时在/usr/share/deepin/dfm-search下创建了正确的目录结构 --- debian/libdfm-search.install | 2 +- debian/libdfm6-search.install | 2 +- src/dfm-search/dfm-search-lib/dfm-search.cmake | 3 +-- src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp | 6 +----- src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h | 4 ++-- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/debian/libdfm-search.install b/debian/libdfm-search.install index 8c90bd37..01f76db8 100644 --- a/debian/libdfm-search.install +++ b/debian/libdfm-search.install @@ -1,3 +1,3 @@ usr/lib/*/libdfm-search*.so* usr/bin/dfm-searcher -share/deepin/dfm-search/semantic/rules/* \ No newline at end of file +usr/share/deepin/dfm-search/semantic/rules/* \ No newline at end of file diff --git a/debian/libdfm6-search.install b/debian/libdfm6-search.install index f4bb6e06..c488f7e4 100644 --- a/debian/libdfm6-search.install +++ b/debian/libdfm6-search.install @@ -1,3 +1,3 @@ usr/lib/*/libdfm6-search*.so* usr/bin/dfm-searcher -share/deepin/dfm6-search/semantic/rules/* \ No newline at end of file +usr/share/deepin/dfm-search/semantic/rules/* \ No newline at end of file diff --git a/src/dfm-search/dfm-search-lib/dfm-search.cmake b/src/dfm-search/dfm-search-lib/dfm-search.cmake index 74181757..6b0ce4f2 100644 --- a/src/dfm-search/dfm-search-lib/dfm-search.cmake +++ b/src/dfm-search/dfm-search-lib/dfm-search.cmake @@ -20,7 +20,6 @@ add_library(${BIN_NAME} SHARED target_compile_definitions(${BIN_NAME} PRIVATE CMAKE_INSTALL_PREFIX="${CMAKE_INSTALL_PREFIX}" - DFM_SEARCH_LIB_NAME="${BIN_NAME}" ) target_link_libraries(${BIN_NAME} PUBLIC @@ -84,7 +83,7 @@ install(DIRECTORY # install semantic search
rules (locale subdirectories preserved) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/semantic/rules/ - DESTINATION share/deepin/${BIN_NAME}/semantic/rules + DESTINATION share/deepin/dfm-search/semantic/rules ) # for pc file config - update to include all dependencies diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp index 39d78355..bcf3baf3 100644 --- a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp @@ -23,11 +23,7 @@ constexpr auto kInstallPrefix = CMAKE_INSTALL_PREFIX; constexpr auto kInstallPrefix = "/usr"; #endif -#ifdef DFM_SEARCH_LIB_NAME -constexpr auto kLibName = DFM_SEARCH_LIB_NAME; -#else -constexpr auto kLibName = "dfm6-search"; -#endif +constexpr auto kLibName = "dfm-search"; } // namespace QString RuleConfigLoader::libName() diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h index 34a6469c..82c6656f 100644 --- a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h @@ -22,8 +22,8 @@ struct RuleGroup; * zh_CN -> zh -> zh_CN (default) * * Priority: user-local config > system-installed config. - * System path: /usr/share/deepin//semantic/rules/ - * User path: ~/.config/deepin//semantic/rules/ + * System path: /usr/share/deepin/dfm-search/semantic/rules/ + * User path: ~/.config/deepin/dfm-search/semantic/rules/ */ class RuleConfigLoader { From 2cfb3942196160a5bbd992bb7f09e5380dd1b78d Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 15 May 2026 10:09:19 +0800 Subject: [PATCH 12/36] feat: add file metadata attributes to search results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added checksum support for OCR search results via new methods checksum() and setChecksum() 2. 
Added file size attribute support for both text and OCR search results 3. Implemented file size retrieval from Lucene index 4. Updated output formatting to display both checksum and file size 5. Added file size processing in indexed search strategies These changes enhance search result metadata by incorporating file verification (checksums) and size information, which improves file identification and management capabilities. Log: Added file checksum and size information to search results Influence: 1. Test OCR searches verify checksum display 2. Verify file size appears correctly in both text and OCR search results 3. Test with files of various sizes 4. Verify empty/null cases when attributes are not available 5. Test search performance with new metadata attributes feat: 为搜索结果添加文件元数据属性 1. 通过新增 checksum() 和 setChecksum() 方法为 OCR 搜索结果添加校验和 支持 2. 为文本和 OCR 搜索结果添加文件大小属性支持 3. 实现从 Lucene 索引中检索文件大小 4. 更新输出格式以显示校验和和文件大小信息 5. 在索引搜索策略中添加文件大小处理 这些改动通过整合文件验证(校验和)和尺寸信息增强了搜索结果元数据, 提高了文件识别和管理能力。 Log: 在搜索结果中添加文件校验和和大小信息 Influence: 1. 测试 OCR 搜索验证校验和显示 2. 验证文件大小在文本和 OCR 搜索结果中正确显示 3. 测试各种大小文件的处理 4. 验证当属性不可用时的空/空值情况处理 5. 
测试新增元数据属性对搜索性能的影响 --- .../dfm-search/dfm-search/ocrtextsearchapi.h | 12 ++++++++++++ include/dfm-search/dfm-search/textsearchapi.h | 14 ++++++++++++++ .../dfm-search-client/output/json_output.cpp | 18 ++++++++++++++++++ .../dfm-search-client/output/text_output.cpp | 18 ++++++++++++++++++ .../contentstrategies/indexedstrategy.cpp | 10 ++++++++++ .../ocrtextsearch/ocrtextsearchapi.cpp | 10 ++++++++++ .../ocrtextstrategies/indexedstrategy.cpp | 16 ++++++++++++++++ .../textsearch/textsearchapi.cpp | 10 ++++++++++ 8 files changed, 108 insertions(+) diff --git a/include/dfm-search/dfm-search/ocrtextsearchapi.h b/include/dfm-search/dfm-search/ocrtextsearchapi.h index 51ef0207..ff8c9298 100644 --- a/include/dfm-search/dfm-search/ocrtextsearchapi.h +++ b/include/dfm-search/dfm-search/ocrtextsearchapi.h @@ -75,6 +75,18 @@ class OcrTextResultAPI : public TextSearchResultAPI * @param content The OCR extracted text to set */ void setOcrContent(const QString &content); + + /** + * @brief Get the file checksum + * @return The checksum string, or empty if not set + */ + QString checksum() const; + + /** + * @brief Set the file checksum + * @param checksum The checksum string to set + */ + void setChecksum(const QString &checksum); }; DFM_SEARCH_END_NS diff --git a/include/dfm-search/dfm-search/textsearchapi.h b/include/dfm-search/dfm-search/textsearchapi.h index 00864d7d..c9ce5319 100644 --- a/include/dfm-search/dfm-search/textsearchapi.h +++ b/include/dfm-search/dfm-search/textsearchapi.h @@ -127,6 +127,20 @@ class TextSearchResultAPI */ void setHighlightedContent(const QString &content); + // ==================== File Size ==================== + + /** + * @brief Set the file size in bytes + * @param bytes File size in bytes + */ + void setFileSizeBytes(qint64 bytes); + + /** + * @brief Get the file size in bytes + * @return File size in bytes, 0 if not set + */ + qint64 fileSizeBytes() const; + // ==================== Extended Attributes ==================== /** diff --git 
a/src/dfm-search/dfm-search-client/output/json_output.cpp b/src/dfm-search/dfm-search-client/output/json_output.cpp index 0034013b..e3837ec3 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.cpp +++ b/src/dfm-search/dfm-search-client/output/json_output.cpp @@ -112,6 +112,12 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) obj["birthTime"] = birthTimeObj; } + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + obj["sizeBytes"] = sizeBytes; + } + return obj; } else if (m_searchType == SearchType::Ocr) { // OCR 搜索:返回详细对象 @@ -152,6 +158,18 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) obj["birthTime"] = birthTimeObj; } + // 文件校验和 + QString checksum = resultAPI.checksum(); + if (!checksum.isEmpty()) { + obj["checksum"] = checksum; + } + + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + obj["sizeBytes"] = sizeBytes; + } + return obj; } return result.path(); diff --git a/src/dfm-search/dfm-search-client/output/text_output.cpp b/src/dfm-search/dfm-search-client/output/text_output.cpp index 63bef3ca..a9f269ad 100644 --- a/src/dfm-search/dfm-search-client/output/text_output.cpp +++ b/src/dfm-search/dfm-search-client/output/text_output.cpp @@ -150,6 +150,12 @@ void TextOutput::printSearchResult(const SearchResult &result) std::cout << " Created: " << resultAPI.birthTimeString().toStdString() << " (timestamp: " << birthTs << ")" << std::endl; } + + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + std::cout << " Size: " << sizeBytes << " bytes" << std::endl; + } } else if (m_searchType == SearchType::Ocr) { OcrTextResultAPI resultAPI(const_cast(result)); @@ -182,6 +188,18 @@ void TextOutput::printSearchResult(const SearchResult &result) std::cout << " Created: " << resultAPI.birthTimeString().toStdString() << " (timestamp: " << birthTs << ")" << std::endl; } + + // 文件校验和 + QString checksum = resultAPI.checksum(); + if 
(!checksum.isEmpty()) { + std::cout << " Checksum: " << checksum.toStdString() << std::endl; + } + + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + std::cout << " Size: " << sizeBytes << " bytes" << std::endl; + } } std::cout << std::endl; diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index 35dadaeb..5e8642d8 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -443,6 +443,16 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr resultApi.setBirthTimestamp(timestamp); } } + + // 文件大小 + Lucene::String fileSizeField = doc->get(LuceneFieldNames::Content::kFileSize); + if (!fileSizeField.empty()) { + bool ok = false; + qint64 fileSize = QString::fromStdWString(fileSizeField).toLongLong(&ok); + if (ok && fileSize > 0) { + resultApi.setFileSizeBytes(fileSize); + } + } } // 添加到结果集合 diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp index 30e3f8c8..6ad7bb32 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp @@ -44,4 +44,14 @@ void OcrTextResultAPI::setOcrContent(const QString &content) m_result.setCustomAttribute("ocrContent", content); } +QString OcrTextResultAPI::checksum() const +{ + return m_result.customAttribute("checksum").toString(); +} + +void OcrTextResultAPI::setChecksum(const QString &checksum) +{ + m_result.setCustomAttribute("checksum", checksum); +} + DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp 
index d83ac4b4..9c0e6ad5 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -443,6 +443,22 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr resultApi.setBirthTimestamp(timestamp); } } + + // 文件校验和 + Lucene::String checksumField = doc->get(LuceneFieldNames::OcrText::kCheckSum); + if (!checksumField.empty()) { + resultApi.setChecksum(QString::fromStdWString(checksumField)); + } + + // 文件大小 + Lucene::String fileSizeField = doc->get(LuceneFieldNames::OcrText::kFileSize); + if (!fileSizeField.empty()) { + bool ok = false; + qint64 fileSize = QString::fromStdWString(fileSizeField).toLongLong(&ok); + if (ok && fileSize > 0) { + resultApi.setFileSizeBytes(fileSize); + } + } } // Add to result collection diff --git a/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp b/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp index fe3ed78d..da5087b9 100644 --- a/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp @@ -57,6 +57,16 @@ QString TextSearchOptionsAPI::filenameKeyword() const // ==================== TextSearchResultAPI ==================== +void TextSearchResultAPI::setFileSizeBytes(qint64 bytes) +{ + m_result.setCustomAttribute("fileSizeBytes", bytes); +} + +qint64 TextSearchResultAPI::fileSizeBytes() const +{ + return m_result.customAttribute("fileSizeBytes").toLongLong(); +} + TextSearchResultAPI::TextSearchResultAPI(SearchResult &result) : m_result(result) { From caa556dc7619b501961e0c083013718f6d240037 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 15 May 2026 16:36:50 +0800 Subject: [PATCH 13/36] fix: improve Chinese NLP search functionality MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 
Added support for Chinese unit size queries (兆/GB/MB/KB) in SizeExtractor 2. Renamed ruleFileNames() to ruleFilePaths() and improved rule file loading 3. Removed QFileSystemWatcher for rule file changes to simplify implementation 4. Updated noise rules with more common search lead-in phrases 5. Consolidated document type synonyms in filetype_rules.json 6. Improved test coverage for Chinese unit size queries Log: Enhanced Chinese NLP search with better size unit support and simplified rule management Influence: 1. Test searching with Chinese size units (e.g. "大于100兆的文件") 2. Verify document type search after synonym updates 3. Check handling of raw byte size queries 4. Test with various noise/lead-in phrases 5. Verify rule loading from both user and system directories fix: 改进中文自然语言搜索功能 1. 在SizeExtractor中添加对中文单位查询的支持(兆/GB/MB/KB) 2. 将ruleFileNames()重命名为ruleFilePaths()并改进规则文件加载 3. 移除了规则文件变更的QFileSystemWatcher以简化实现 4. 更新了噪声规则,包含更多常见搜索引导词 5. 合并了文件类型同义词文档 6. 增加了中文单位查询的测试覆盖率 Log: 改进中文自然语言搜索,优化尺寸单位支持和简化规则管理 Influence: 1. 测试使用中文尺寸单位的搜索(如"大于100兆的文件") 2. 验证更新同义词后的文档类型搜索 3. 检查原始字节大小查询的处理 4. 使用各种噪声词/引导词测试 5. 
验证从用户和系统目录加载规则 --- .../dfm-search-tests/tst_chinese_nlp.cpp | 106 ++++++++++++------ .../dfm-search-tests/tst_semantic_search.cpp | 18 +-- .../semantic/extractors/sizeextractor.cpp | 39 ++++--- .../semantic/extractors/sizeextractor.h | 1 + .../semantic/ruleconfigloader.cpp | 38 +++++-- .../semantic/ruleconfigloader.h | 9 +- .../semantic/rules/zh_CN/filetype_rules.json | 2 +- .../semantic/rules/zh_CN/noise_rules.json | 10 +- .../semantic/rules/zh_CN/size_rules.json | 21 ++-- .../semantic/semanticquerybuilder.cpp | 4 +- .../semantic/semanticruleengine.cpp | 45 +------- .../semantic/semanticruleengine.h | 13 --- 12 files changed, 162 insertions(+), 144 deletions(-) diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp index d1e3a3d2..0a63d245 100644 --- a/autotests/dfm-search-tests/tst_chinese_nlp.cpp +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -17,8 +17,7 @@ using namespace DFMSEARCH; static QString rulesDir() { - return QStringLiteral(TEST_SOURCE_DIR) + QStringLiteral( - "/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN"); + return QStringLiteral(TEST_SOURCE_DIR) + QStringLiteral("/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN"); } // Helper: compare two QStringList as sets (order-independent) @@ -104,6 +103,10 @@ private Q_SLOTS: void size_dynamic_min(); void size_dynamic_max(); void size_dynamic_between(); + void size_chineseUnits_min(); + void size_chineseUnits_max(); + void size_chineseUnits_range(); + void size_noUnit_bytes(); void size_combined_withTime(); void size_combined_withType(); void size_combined_full(); @@ -262,8 +265,8 @@ void tst_ChineseNLP::timePreset_yesterday() void tst_ChineseNLP::timePreset_yesterday_variants() { const QStringList inputs = { QStringLiteral("昨日"), QStringLiteral("昨晚"), - QStringLiteral("昨天上午"), QStringLiteral("昨天下午"), - QStringLiteral("昨天晚上") }; + QStringLiteral("昨天上午"), QStringLiteral("昨天下午"), + QStringLiteral("昨天晚上") }; for (const QString &input : 
inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -283,7 +286,7 @@ void tst_ChineseNLP::timePreset_dayBeforeYesterday() void tst_ChineseNLP::timePreset_thisWeek_variants() { const QStringList inputs = { QStringLiteral("本周"), QStringLiteral("这周"), - QStringLiteral("这个星期"), QStringLiteral("这一个星期") }; + QStringLiteral("这个星期"), QStringLiteral("这一个星期") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -295,7 +298,7 @@ void tst_ChineseNLP::timePreset_thisWeek_variants() void tst_ChineseNLP::timePreset_lastWeek_variants() { const QStringList inputs = { QStringLiteral("上周"), QStringLiteral("上个星期"), - QStringLiteral("上星期"), QStringLiteral("上一个星期") }; + QStringLiteral("上星期"), QStringLiteral("上一个星期") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -307,7 +310,7 @@ void tst_ChineseNLP::timePreset_lastWeek_variants() void tst_ChineseNLP::timePreset_thisMonth_variants() { const QStringList inputs = { QStringLiteral("本月"), QStringLiteral("这个月"), - QStringLiteral("当月") }; + QStringLiteral("当月") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -330,7 +333,7 @@ void tst_ChineseNLP::timePreset_lastMonth_variants() void tst_ChineseNLP::timePreset_thisYear_variants() { const QStringList inputs = { QStringLiteral("今年"), QStringLiteral("本年"), - QStringLiteral("这年") }; + QStringLiteral("这年") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -409,9 +412,9 @@ void tst_ChineseNLP::fileType_precise_ppt() void tst_ChineseNLP::fileType_category_image_variants() { const QStringList inputs = { QStringLiteral("图片"), QStringLiteral("照片"), - QStringLiteral("截图"), QStringLiteral("壁纸"), - QStringLiteral("海报"), QStringLiteral("相片"), - QStringLiteral("表情包"), QStringLiteral("图") }; + QStringLiteral("截图"), QStringLiteral("壁纸"), + QStringLiteral("海报"), QStringLiteral("相片"), + QStringLiteral("表情包"), 
QStringLiteral("图") }; const QStringList expectedExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" }; for (const QString &input : inputs) { ParsedIntent intent; @@ -424,8 +427,8 @@ void tst_ChineseNLP::fileType_category_image_variants() void tst_ChineseNLP::fileType_category_video_variants() { const QStringList inputs = { QStringLiteral("视频"), QStringLiteral("录像"), - QStringLiteral("电影"), QStringLiteral("动画"), - QStringLiteral("短片"), QStringLiteral("片子") }; + QStringLiteral("电影"), QStringLiteral("动画"), + QStringLiteral("短片"), QStringLiteral("片子") }; const QStringList expectedExts = { "mp4", "avi", "mkv", "mov", "flv", "wmv", "webm" }; for (const QString &input : inputs) { ParsedIntent intent; @@ -438,8 +441,8 @@ void tst_ChineseNLP::fileType_category_video_variants() void tst_ChineseNLP::fileType_category_audio_variants() { const QStringList inputs = { QStringLiteral("音频"), QStringLiteral("音乐"), - QStringLiteral("录音"), QStringLiteral("歌"), - QStringLiteral("语音") }; + QStringLiteral("录音"), QStringLiteral("歌"), + QStringLiteral("语音") }; const QStringList expectedExts = { "mp3", "wav", "flac", "aac", "ogg", "m4a" }; for (const QString &input : inputs) { ParsedIntent intent; @@ -886,13 +889,13 @@ void tst_ChineseNLP::timeCustom_lastYear_extra() void tst_ChineseNLP::fileType_document_general_allSynonyms() { - // Requirements 2.3.2.2.2: 文档, 文件, 报告, 文章, 方案, 文本, 资料, 笔记, 稿件 + // Requirements 2.3.2.2.2: 文档, 报告, 文章, 方案, 文本, 资料, 笔记, 稿件 const QStringList inputs = { - QStringLiteral("文档"), QStringLiteral("文件"), QStringLiteral("报告"), + QStringLiteral("文档"), QStringLiteral("报告"), QStringLiteral("文章"), QStringLiteral("方案"), QStringLiteral("文本"), QStringLiteral("资料"), QStringLiteral("笔记"), QStringLiteral("稿件") }; - const QStringList expectedExts = {"doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt"}; + const QStringList expectedExts = { "doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt" }; for (const QString &input : inputs) { ParsedIntent intent; 
m_parser->parse(input, intent); @@ -947,7 +950,7 @@ void tst_ChineseNLP::fileType_image_allSynonyms() QStringLiteral("图"), QStringLiteral("壁纸"), QStringLiteral("海报"), QStringLiteral("相片"), QStringLiteral("表情包") }; - const QStringList expectedExts = {"jpg", "jpeg", "png", "gif", "bmp", "webp", "svg"}; + const QStringList expectedExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -963,7 +966,7 @@ void tst_ChineseNLP::fileType_video_allSynonyms() QStringLiteral("视频"), QStringLiteral("录像"), QStringLiteral("电影"), QStringLiteral("动画"), QStringLiteral("短片"), QStringLiteral("片子") }; - const QStringList expectedExts = {"mp4", "avi", "mkv", "mov", "flv", "wmv", "webm"}; + const QStringList expectedExts = { "mp4", "avi", "mkv", "mov", "flv", "wmv", "webm" }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -979,7 +982,7 @@ void tst_ChineseNLP::fileType_audio_allSynonyms() QStringLiteral("音频"), QStringLiteral("音乐"), QStringLiteral("录音"), QStringLiteral("歌"), QStringLiteral("语音") }; - const QStringList expectedExts = {"mp3", "wav", "flac", "aac", "ogg", "m4a"}; + const QStringList expectedExts = { "mp3", "wav", "flac", "aac", "ogg", "m4a" }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -1087,14 +1090,14 @@ void tst_ChineseNLP::size_fuzzy_large() ParsedIntent intent; m_parser->parse(QStringLiteral("大文件"), intent); QVERIFY(intent.sizeConstraint.isValid()); - QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB - QCOMPARE(intent.sizeConstraint.maxSize, 0LL); // no upper bound + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB + QCOMPARE(intent.sizeConstraint.maxSize, 0LL); // no upper bound } void tst_ChineseNLP::size_fuzzy_large_synonyms() { const QStringList inputs = { QStringLiteral("很大的"), QStringLiteral("占空间的"), - QStringLiteral("几个G的") }; + 
QStringLiteral("几个G的") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input + QStringLiteral("的图片"), intent); @@ -1109,7 +1112,7 @@ void tst_ChineseNLP::size_fuzzy_small() m_parser->parse(QStringLiteral("小文件"), intent); QVERIFY(intent.sizeConstraint.isValid()); QCOMPARE(intent.sizeConstraint.minSize, 0LL); - QCOMPARE(intent.sizeConstraint.maxSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.maxSize, 1048576LL); // 1MB QCOMPARE(intent.sizeConstraint.includeUpper, false); } @@ -1118,7 +1121,7 @@ void tst_ChineseNLP::size_dynamic_min() ParsedIntent intent; m_parser->parse(QStringLiteral("大于500M的文档"), intent); QVERIFY(intent.sizeConstraint.isValid()); - QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB QVERIFY(intent.sizeConstraint.includeLower); } @@ -1127,7 +1130,7 @@ void tst_ChineseNLP::size_dynamic_max() ParsedIntent intent; m_parser->parse(QStringLiteral("小于100K的文件"), intent); QVERIFY(intent.sizeConstraint.isValid()); - QCOMPARE(intent.sizeConstraint.maxSize, 102400LL); // 100KB + QCOMPARE(intent.sizeConstraint.maxSize, 102400LL); // 100KB QCOMPARE(intent.sizeConstraint.minSize, 0LL); } @@ -1137,7 +1140,42 @@ void tst_ChineseNLP::size_dynamic_between() m_parser->parse(QStringLiteral("1M-10M的文件"), intent); QVERIFY(intent.sizeConstraint.isValid()); QCOMPARE(intent.sizeConstraint.minSize, 1048576LL); // 1MB - QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB +} + +void tst_ChineseNLP::size_chineseUnits_min() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大于100兆的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + QVERIFY(intent.sizeConstraint.includeLower); +} + +void tst_ChineseNLP::size_chineseUnits_max() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小于50兆的图片"), intent); + 
QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.maxSize, 52428800LL); // 50MB +} + +void tst_ChineseNLP::size_chineseUnits_range() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("找下大小在1兆到10兆的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QVERIFY(intent.keywords.isEmpty()); + QCOMPARE(intent.sizeConstraint.minSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB +} + +void tst_ChineseNLP::size_noUnit_bytes() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小于1024的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.maxSize, 1024LL); // raw bytes } void tst_ChineseNLP::size_combined_withTime() @@ -1165,7 +1203,7 @@ void tst_ChineseNLP::size_combined_full() QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); QVERIFY(intent.sizeConstraint.isValid()); - QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB QVERIFY(intent.fileExtensions.contains("jpg")); QVERIFY(intent.fileExtensions.contains("mp4")); } @@ -1189,7 +1227,7 @@ void tst_ChineseNLP::timeRelative_justNow() void tst_ChineseNLP::timeRelative_justNow_synonyms() { const QStringList inputs = { QStringLiteral("刚才"), QStringLiteral("刚"), - QStringLiteral("这会儿") }; + QStringLiteral("这会儿") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input + QStringLiteral("的文档"), intent); @@ -1212,7 +1250,7 @@ void tst_ChineseNLP::timeRelative_recentDays() void tst_ChineseNLP::timeRelative_recentDays_synonyms() { const QStringList inputs = { QStringLiteral("这几天"), QStringLiteral("近期"), - QStringLiteral("这阵子") }; + QStringLiteral("这阵子") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input + QStringLiteral("的文件"), intent); @@ -1295,7 +1333,7 @@ void tst_ChineseNLP::action_create_birthTime() 
void tst_ChineseNLP::action_create_synonyms() { const QStringList inputs = { QStringLiteral("创建的文档"), QStringLiteral("存下来的图片"), - QStringLiteral("保存的文件"), QStringLiteral("新加的视频") }; + QStringLiteral("保存的文件"), QStringLiteral("新加的视频") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -1322,7 +1360,7 @@ void tst_ChineseNLP::action_modify_modifyTime() void tst_ChineseNLP::action_modify_synonyms() { const QStringList inputs = { QStringLiteral("编辑过的文档"), QStringLiteral("改过的文件"), - QStringLiteral("写过的图片"), QStringLiteral("更新的视频") }; + QStringLiteral("写过的图片"), QStringLiteral("更新的视频") }; for (const QString &input : inputs) { ParsedIntent intent; m_parser->parse(input, intent); @@ -1437,7 +1475,7 @@ void tst_ChineseNLP::location_trash() // "回收站里的文件" → location(trash) + includeHidden + filetype(文件=文档类) const QString trashPath = QDir::homePath() + "/.local/share/Trash/files"; ParsedIntent intent; - m_parser->parse(QStringLiteral("回收站里的文件"), intent); + m_parser->parse(QStringLiteral("回收站里的文档"), intent); QCOMPARE(intent.searchDirectories.size(), 1); QCOMPARE(intent.searchDirectories.first(), trashPath); QVERIFY(intent.includeHidden); diff --git a/autotests/dfm-search-tests/tst_semantic_search.cpp b/autotests/dfm-search-tests/tst_semantic_search.cpp index 56de2e3a..a9aa384f 100644 --- a/autotests/dfm-search-tests/tst_semantic_search.cpp +++ b/autotests/dfm-search-tests/tst_semantic_search.cpp @@ -676,12 +676,8 @@ QString sourceRulesDir() bool sourceRulesAvailable() { const QString dir = sourceRulesDir(); - for (const QString &filename : RuleConfigLoader::ruleFileNames()) { - if (!QFile::exists(dir + "/" + filename)) { - return false; - } - } - return true; + return QDir(dir).exists() + && !QDir(dir).entryList({"*.json"}, QDir::Files).isEmpty(); } // Replicate isSemanticQuery() logic using internal components with source-tree rules. 
@@ -745,13 +741,9 @@ void tst_IsSemanticQuery::initTestCase() m_engine = new SemanticRuleEngine(this); const QString dir = sourceRulesDir(); - QStringList ruleFiles = RuleConfigLoader::ruleFileNames(); - // size_rules.json is not yet registered in RuleConfigLoader::ruleFileNames(), - // but the IntentParser includes a SizeExtractor. Load it explicitly. - if (!ruleFiles.contains("size_rules.json")) { - ruleFiles.append("size_rules.json"); - } - for (const QString &filename : std::as_const(ruleFiles)) { + const QStringList ruleFiles = QDir(dir).entryList( + {"*.json"}, QDir::Files, QDir::Name); + for (const QString &filename : ruleFiles) { QString path = dir + "/" + filename; if (!m_engine->loadRuleFile(path)) { qWarning() << "Failed to load rule file:" << path; diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp index 46794f27..87a1dc72 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp @@ -47,7 +47,7 @@ void SizeExtractor::extract(const QString &input, ParsedIntent &intent) if (direction == "min") { const QString value = match.captured("value"); - const QString unit = match.captured("unit"); + const QString unit = normalizeUnit(match.captured("unit"), metadata); qint64 bytes = parseSizeToBytes(value, unit); if (bytes <= 0) { return; @@ -56,7 +56,7 @@ void SizeExtractor::extract(const QString &input, ParsedIntent &intent) sc.includeLower = true; } else if (direction == "max") { const QString value = match.captured("value"); - const QString unit = match.captured("unit"); + const QString unit = normalizeUnit(match.captured("unit"), metadata); qint64 bytes = parseSizeToBytes(value, unit); if (bytes <= 0) { return; @@ -65,9 +65,9 @@ void SizeExtractor::extract(const QString &input, ParsedIntent &intent) sc.includeUpper = true; } else if (direction == "range") { const 
QString minVal = match.captured("min_val"); - const QString minUnit = match.captured("min_unit"); + const QString minUnit = normalizeUnit(match.captured("min_unit"), metadata); const QString maxVal = match.captured("max_val"); - const QString maxUnit = match.captured("max_unit"); + const QString maxUnit = normalizeUnit(match.captured("max_unit"), metadata); qint64 minBytes = parseSizeToBytes(minVal, minUnit); qint64 maxBytes = parseSizeToBytes(maxVal, maxUnit); if (minBytes <= 0 || maxBytes <= 0) { @@ -88,6 +88,23 @@ void SizeExtractor::extract(const QString &input, ParsedIntent &intent) } } +QString SizeExtractor::normalizeUnit(const QString &rawUnit, const QVariantMap &metadata) +{ + if (rawUnit.isEmpty()) { + return {}; + } + + const QVariantMap unitMap = metadata.value("unit_map").toMap(); + if (!unitMap.isEmpty()) { + const QString mapped = unitMap.value(rawUnit).toString(); + if (!mapped.isEmpty()) { + return mapped; + } + } + + return rawUnit; +} + qint64 SizeExtractor::parseSizeToBytes(const QString &value, const QString &unit) { bool ok = false; @@ -96,21 +113,17 @@ qint64 SizeExtractor::parseSizeToBytes(const QString &value, const QString &unit return -1; } - QString upperUnit = unit.toUpper(); - if (upperUnit.isEmpty()) { - // No unit: assume bytes - return static_cast(num); - } - if (upperUnit == "B" || upperUnit == "BB") { + const QString u = unit.toUpper(); + if (u.isEmpty() || u == "B" || u == "BB") { return static_cast(num); } - if (upperUnit == "K" || upperUnit == "KB") { + if (u == "K" || u == "KB") { return static_cast(num * 1024); } - if (upperUnit == "M" || upperUnit == "MB") { + if (u == "M" || u == "MB") { return static_cast(num * 1024 * 1024); } - if (upperUnit == "G" || upperUnit == "GB") { + if (u == "G" || u == "GB") { return static_cast(num * 1024 * 1024 * 1024); } diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h index d87a002a..2946d3a7 
100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h @@ -22,6 +22,7 @@ class SizeExtractor : public DimensionExtractor private: static qint64 parseSizeToBytes(const QString &value, const QString &unit); + static QString normalizeUnit(const QString &rawUnit, const QVariantMap &metadata); SemanticRuleEngine *m_engine; }; diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp index bcf3baf3..e7372d42 100644 --- a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include DFM_SEARCH_BEGIN_NS @@ -35,17 +36,17 @@ QString RuleConfigLoader::systemRulesDir() { return QDir(QDir(QLatin1String(kInstallPrefix)) .absoluteFilePath(QLatin1String("share/deepin/") - + libName() - + "/semantic/rules")) + + libName() + + "/semantic/rules")) .absolutePath(); } QString RuleConfigLoader::userRulesDir() { return QDir(QStandardPaths::writableLocation(QStandardPaths::GenericConfigLocation) - + "/deepin/" - + libName() - + "/semantic/rules") + + "/deepin/" + + libName() + + "/semantic/rules") .absolutePath(); } @@ -54,13 +55,28 @@ QString RuleConfigLoader::currentLocaleName() return QLocale::system().name().simplified(); } -QStringList RuleConfigLoader::ruleFileNames() +QStringList RuleConfigLoader::ruleFilePaths() { - return {"noise_rules.json", - "time_rules.json", - "filetype_rules.json", - "keyword_rules.json", - "location_rules.json"}; + QStringList paths; + QSet seen; // deduplicate by filename + + const QStringList dirs { resolveLocaleDir(userRulesDir()), + resolveLocaleDir(systemRulesDir()) }; + + for (const QString &dir : dirs) { + const QStringList files = QDir(dir).entryList( + QStringList { QStringLiteral("*.json") }, + QDir::Files | QDir::Readable); + for (const QString 
&filename : files) { + const QString absPath = QDir(dir).absoluteFilePath(filename); + if (!seen.contains(filename) && validateRuleFile(absPath)) { + paths.append(absPath); + seen.insert(filename); + } + } + } + + return paths; } QString RuleConfigLoader::resolveLocaleDir(const QString &baseDir) diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h index 82c6656f..d03bf3b9 100644 --- a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h @@ -50,12 +50,15 @@ class RuleConfigLoader static QString currentLocaleName(); /** - * @brief Get the list of expected rule file names. + * @brief Scan locale directories and return resolved paths for all rule files. + * Scans user dir first, then system dir; user files take priority. + * Falls back through locale chain: zh_CN -> zh -> zh_CN (default). + * @return Deduplicated list of resolved absolute paths */ - static QStringList ruleFileNames(); + static QStringList ruleFilePaths(); /** - * @brief Resolve the effective path for a rule file. + * @brief Resolve the effective path for a single rule file. * Checks user dir first, then system dir, with locale subdirectory lookup. * Falls back to zh_CN if the current locale directory is not found. 
* @param filename The rule file name (e.g., "time_rules.json") diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json index 967da828..ac471fbf 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json @@ -50,7 +50,7 @@ }, { "id": "filetype_document_general", - "pattern": "文档|文件|报告|文章|方案|文本|资料|笔记|稿件", + "pattern": "文档|报告|文章|方案|文本|资料|笔记|稿件", "description": "Generic documents (fallback)", "enabled": true, "priority": 100, diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json index ce0a2d70..c469a2e2 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json @@ -8,7 +8,7 @@ "rules": [ { "id": "noise_action", - "pattern": "帮我找到|搜索|查找|找一下|搜一下|查一下|帮我搜|帮我找", + "pattern": "帮我找到|帮我搜|帮我找|搜索|查找|找一下|搜一下|查一下|找下|搜下|查下|找找|搜搜", "description": "Search action words to consume", "enabled": true, "priority": 100, @@ -22,6 +22,14 @@ "priority": 90, "metadata": {} }, + { + "id": "noise_size_lead", + "pattern": "大小在|大小为|大小是|体积在|体积为|体积是|容量在|容量为|容量是|大小不超过|大小超过|大小不到|大小最多|大小最少|大小至少", + "description": "Size constraint lead-in words to consume", + "enabled": true, + "priority": 140, + "metadata": {} + }, { "id": "noise_polite", "pattern": "请|麻烦|谢谢|帮我", diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json index 354ad260..37e4a4bf 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json @@ -33,35 +33,38 @@ }, { "id": "size_dynamic", - "pattern": "(大于|超过|最少|至少|>)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?)以?[上内]?", - 
"description": "Dynamic precise size (e.g. 大于500M, 1G以上)", + "pattern": "(大于|超过|最少|至少|>)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?以?[上内]?", + "description": "Dynamic precise size min (e.g. 大于500M, 1G以上, 大于100兆)", "enabled": true, "priority": 150, "metadata": { "type": "dynamic", - "direction": "min" + "direction": "min", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} } }, { "id": "size_dynamic_less", - "pattern": "(小于|不超过|不到|最多|<)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?)以?[下内]?", - "description": "Dynamic precise size less-than (e.g. 小于100K, 不到1G)", + "pattern": "(小于|不超过|不到|最多|<)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?以?[下内]?", + "description": "Dynamic precise size max (e.g. 小于100K, 不到1G, 小于100兆)", "enabled": true, "priority": 150, "metadata": { "type": "dynamic", - "direction": "max" + "direction": "max", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} } }, { "id": "size_dynamic_between", - "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?)[\\s~\\-到至]+(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?)", - "description": "Size range (e.g. 1M-10M, 100K到500K)", + "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?[\\s~\\-到至]+(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?", + "description": "Size range (e.g. 
1M-10M, 100K到500K, 1兆到10兆)", "enabled": true, "priority": 150, "metadata": { "type": "dynamic", - "direction": "range" + "direction": "range", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} } } ] diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index 65d074cf..0e817b10 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -43,8 +43,6 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) { SearchOptions opts = baseOpts; FileNameOptionsAPI fnameApi(opts); - fnameApi.setPinyinEnabled(true); - fnameApi.setPinyinAcronymEnabled(true); if (!intent.fileExtensions.isEmpty()) { fnameApi.setFileExtensions(intent.fileExtensions); @@ -56,7 +54,7 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) plan.fileNameQuery = SearchFactory::createQuery(intent.keywords, SearchQuery::Type::Boolean); } else { // No keywords: search all files (use wildcard to match everything) - plan.fileNameQuery = SearchFactory::createQuery(QStringLiteral("*")); + plan.fileNameQuery = SearchFactory::createQuery(""); } plan.fileNameOptions = opts; diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp index 1db5fa30..8980e5a6 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp @@ -8,27 +8,12 @@ #include #include #include -#include DFM_SEARCH_BEGIN_NS SemanticRuleEngine::SemanticRuleEngine(QObject *parent) : QObject(parent) - , m_watcher(new QFileSystemWatcher(this)) - , m_reloadTimer(new QTimer(this)) { - m_reloadTimer->setSingleShot(true); - m_reloadTimer->setInterval(100); - - QObject::connect(m_reloadTimer, &QTimer::timeout, this, [this]() { - loadRules(); - Q_EMIT rulesReloaded(); - }); 
- - QObject::connect(m_watcher, &QFileSystemWatcher::fileChanged, - this, [this](const QString &) { - m_reloadTimer->start(); - }); } SemanticRuleEngine::~SemanticRuleEngine() = default; @@ -36,15 +21,8 @@ SemanticRuleEngine::~SemanticRuleEngine() = default; bool SemanticRuleEngine::loadRules() { QMap newGroups; - QStringList watchedFiles; - - for (const QString &filename : RuleConfigLoader::ruleFileNames()) { - const QString path = RuleConfigLoader::resolveRulePath(filename); - if (path.isEmpty()) { - qWarning() << "Rule file not found:" << filename; - continue; - } + for (const QString &path : RuleConfigLoader::ruleFilePaths()) { QList loaded; if (!RuleConfigLoader::loadRuleFile(path, loaded)) { qWarning() << "Failed to load rule file:" << path; @@ -73,29 +51,16 @@ bool SemanticRuleEngine::loadRules() } m_ruleFilePaths.insert(group.name, path); - if (!watchedFiles.contains(path)) { - watchedFiles.append(path); - } } } if (newGroups.isEmpty()) { - qWarning() << "No rule files loaded, keeping cached rules"; + qWarning() << "No rule files loaded, keeping existing rules"; return !m_groups.isEmpty(); } - // Cache valid rules for rollback - m_cachedGroups = m_groups.isEmpty() ? 
newGroups : m_groups; m_groups = newGroups; - // Update file watcher - if (!m_watcher->files().isEmpty()) { - m_watcher->removePaths(m_watcher->files()); - } - for (const QString &f : watchedFiles) { - m_watcher->addPath(f); - } - return true; } @@ -276,10 +241,4 @@ bool SemanticRuleEngine::parseRuleGroupStatic(const QJsonObject &groupObj, RuleG return !outGroup.rules.isEmpty(); } -void SemanticRuleEngine::onRuleFilesChanged(const QStringList &files) -{ - Q_UNUSED(files); - m_reloadTimer->start(); -} - DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h index ecf95a3e..6f172e6b 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h @@ -5,13 +5,11 @@ #ifndef SEMANTICRULEENGINE_H #define SEMANTICRULEENGINE_H -#include #include #include #include #include #include -#include #include @@ -38,7 +36,6 @@ struct RuleGroup { * @brief Rule engine that loads regex rules from JSON config files. * * Provides match/matchAll operations with priority-based ordering. - * Supports hot-reload via QFileSystemWatcher. 
*/ class SemanticRuleEngine : public QObject { @@ -109,21 +106,11 @@ class SemanticRuleEngine : public QObject */ static bool parseRuleGroupStatic(const QJsonObject &groupObj, RuleGroup &outGroup); -Q_SIGNALS: - void rulesReloaded(); - -private Q_SLOTS: - void onRuleFilesChanged(const QStringList &files); - private: bool parseRuleGroup(const QJsonObject &groupObj, RuleGroup &outGroup); - void startWatching(); QMap m_groups; - QMap m_cachedGroups; // last valid rules for rollback QMap m_ruleFilePaths; // group name -> resolved file path - QFileSystemWatcher *m_watcher = nullptr; - QTimer *m_reloadTimer = nullptr; }; DFM_SEARCH_END_NS From 54bfe85e65e474704815128553095f66405692b8 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 15 May 2026 17:31:41 +0800 Subject: [PATCH 14/36] feat: add semantic search with detailed results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added SearchType::Semantic enum value for semantic/natural-language search 2. Implemented detailed results control in SemanticSearcher with setDetailedResultsEnabled() 3. Modified search engines to always use path prefix query optimization for faster results 4. Changed semantic search to collect all results at once instead of streaming for consistency 5. Updated JSON and text output formatters to handle semantic search results with custom attributes Log: Added semantic search type and detailed results configuration Influence: 1. Test semantic search with various natural language queries 2. Verify detailed results contain all expected metadata when enabled 3. Check JSON output format for semantic search results 4. Verify path prefix optimization works for all search paths 5. Test performance impact of detailed results collection 6. Verify semantic search works with existing filters (time, size) feat: 添加语义搜索与详细结果功能 1. 新增SearchType::Semantic枚举值用于语义/自然语言搜索 2. 在SemanticSearcher中实现详细结果控制setDetailedResultsEnabled() 3. 修改搜索引擎始终使用路径前缀查询优化以获得更快的搜索结果 4. 
将语义搜索改为一次性收集所有结果而非流式传输以保证一致性 5. 更新JSON和文本输出格式化器以处理带有自定义属性的语义搜索结果 Log: 新增语义搜索类型和详细结果配置功能 Influence: 1. 测试不同自然语言查询的语义搜索功能 2. 验证启用详细结果时包含所有预期元数据 3. 检查JSON输出格式是否符合语义搜索结果 4. 确认路径前缀优化对所有搜索路径有效 5. 测试详细结果收集对性能的影响 6. 验证语义搜索与现有筛选器(时间、大小)的兼容性 --- .../dfm-search/dfm-search/dsearch_global.h | 1 + .../dfm-search/dfm-search/semanticsearcher.h | 16 ++++++++ src/dfm-search/dfm-search-client/main.cpp | 25 +++++++++--- .../dfm-search-client/output/json_output.cpp | 17 ++++++++ .../dfm-search-client/output/text_output.cpp | 9 +++++ .../contentstrategies/indexedstrategy.cpp | 27 +++++-------- .../filenamestrategies/indexedstrategy.cpp | 21 +++------- .../ocrtextstrategies/indexedstrategy.cpp | 29 +++++--------- .../semantic/semanticsearcher.cpp | 40 ++++++++++++------- .../semantic/semanticsearcher_p.h | 7 ++-- .../dfm-search-lib/utils/searchutility.cpp | 22 ---------- .../dfm-search-lib/utils/searchutility.h | 7 ---- 12 files changed, 117 insertions(+), 104 deletions(-) diff --git a/include/dfm-search/dfm-search/dsearch_global.h b/include/dfm-search/dfm-search/dsearch_global.h index cfe05523..f03a064b 100644 --- a/include/dfm-search/dfm-search/dsearch_global.h +++ b/include/dfm-search/dfm-search/dsearch_global.h @@ -203,6 +203,7 @@ enum SearchType { FileName, // Search by file name Content, // Search by content within files Ocr, // Search by OCR-extracted text from images + Semantic = 40, // Semantic / natural-language search (launches sub-engines internally) Custom = 50 // User-defined search type }; Q_ENUM_NS(SearchType) diff --git a/include/dfm-search/dfm-search/semanticsearcher.h b/include/dfm-search/dfm-search/semanticsearcher.h index 8c4c85c9..ac39df6e 100644 --- a/include/dfm-search/dfm-search/semanticsearcher.h +++ b/include/dfm-search/dfm-search/semanticsearcher.h @@ -100,6 +100,22 @@ class SemanticSearcher : public QObject */ void cancel(); + /** + * @brief Enable or disable detailed results for sub-engines + * + * When enabled, each sub-engine (FileName, Content, OCR) will 
populate + * extra metadata fields (file type, size, timestamps, etc.) in results. + * Must be called before search(). + * + * @param enable true to enable detailed results (default false) + */ + void setDetailedResultsEnabled(bool enable); + + /** + * @brief Check whether detailed results are enabled + */ + bool isDetailedResultsEnabled() const; + Q_SIGNALS: /** * @brief Emitted when a search operation starts diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index 202d9976..c363af7b 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -165,20 +165,33 @@ int main(int argc, char *argv[]) // Semantic search mode if (config.semanticMode) { auto *semanticSearcher = new DFMSEARCH::SemanticSearcher(&app); + semanticSearcher->setDetailedResultsEnabled(config.verbose); OutputFormatter *formatter = createOutputFormatter(config, &app); + + // 为语义搜索构建 formatter 需要的 options + SearchOptions formatterOptions; + formatterOptions.setDetailedResultsEnabled(config.verbose); + if (config.hasTimeFilter) formatterOptions.setTimeRangeFilter(config.timeFilter); + if (config.hasSizeFilter) formatterOptions.setSizeRangeFilter(config.sizeFilter); + + JsonOutput *jsonOutput = qobject_cast(formatter); + if (jsonOutput) { + jsonOutput->setSearchOptions(formatterOptions); + } + TextOutput *textOutput = qobject_cast(formatter); + if (textOutput) { + textOutput->setSearchOptions(formatterOptions); + textOutput->setVerbose(config.verbose); + } + formatter->setSearchContext(config.keyword, config.searchPath, - SearchType::FileName, SearchMethod::Indexed); + SearchType::Semantic, SearchMethod::Indexed); QObject::connect(formatter, &OutputFormatter::finished, &app, &QCoreApplication::quit); QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchStarted, [formatter]() { formatter->outputSearchStarted(); }); - QObject::connect(semanticSearcher, 
&DFMSEARCH::SemanticSearcher::resultsFound, [formatter](const SearchResultList &results) { - for (const auto &result : results) { - formatter->outputResult(result); - } - }); QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchFinished, [formatter](const SearchResultList &results) { formatter->outputSearchFinished(results); }); diff --git a/src/dfm-search/dfm-search-client/output/json_output.cpp b/src/dfm-search/dfm-search-client/output/json_output.cpp index e3837ec3..2b3e6482 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.cpp +++ b/src/dfm-search/dfm-search-client/output/json_output.cpp @@ -170,6 +170,17 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) obj["sizeBytes"] = sizeBytes; } + return obj; + } else if (m_searchType == SearchType::Semantic) { + // 语义搜索:结果来自多个子引擎,通用输出所有 customAttributes + QJsonObject obj; + obj["path"] = result.path(); + + const QVariantMap attrs = result.customAttributes(); + for (auto it = attrs.cbegin(); it != attrs.cend(); ++it) { + obj[it.key()] = QJsonValue::fromVariant(it.value()); + } + return obj; } return result.path(); @@ -208,6 +219,9 @@ void JsonOutput::outputStreamingStart() case SearchType::Ocr: searchTypeStr = "ocr"; break; + case SearchType::Semantic: + searchTypeStr = "semantic"; + break; default: searchTypeStr = "unknown"; } @@ -320,6 +334,9 @@ void JsonOutput::outputCompleteResult(const QList &results) case SearchType::Ocr: searchTypeStr = "ocr"; break; + case SearchType::Semantic: + searchTypeStr = "semantic"; + break; default: searchTypeStr = "unknown"; } diff --git a/src/dfm-search/dfm-search-client/output/text_output.cpp b/src/dfm-search/dfm-search-client/output/text_output.cpp index a9f269ad..0edc37c9 100644 --- a/src/dfm-search/dfm-search-client/output/text_output.cpp +++ b/src/dfm-search/dfm-search-client/output/text_output.cpp @@ -38,6 +38,8 @@ void TextOutput::outputSearchStarted() typeStr = "Content"; else if (m_searchType == SearchType::Ocr) typeStr = 
"Ocr"; + else if (m_searchType == SearchType::Semantic) + typeStr = "Semantic"; std::cout << "Search type: " << typeStr.toStdString() << std::endl; std::cout << "Search method: " << (m_searchMethod == SearchMethod::Indexed ? "Indexed" : "Realtime") << std::endl; @@ -200,6 +202,13 @@ void TextOutput::printSearchResult(const SearchResult &result) if (sizeBytes > 0) { std::cout << " Size: " << sizeBytes << " bytes" << std::endl; } + } else if (m_searchType == SearchType::Semantic) { + // 语义搜索:通用输出所有 customAttributes + const QVariantMap attrs = result.customAttributes(); + for (auto it = attrs.cbegin(); it != attrs.cend(); ++it) { + std::cout << " " << it.key().toStdString() << ": " + << it.value().toString().toStdString() << std::endl; + } } std::cout << std::endl; diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index 5e8642d8..f3fc4200 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -103,24 +103,15 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Add path prefix query optimization QStringList searchPathsList = m_options.searchPaths(); if (mainQuery && SearchUtility::isContentIndexAncestorPathsSupported()) { - bool usePrefixQuery = false; - for (const QString &p : searchPathsList) { - if (SearchUtility::shouldUsePathPrefixQuery(p)) { - usePrefixQuery = true; - break; - } - } - if (usePrefixQuery) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( - searchPathsList, - QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); - if (pathPrefixQuery) { - BooleanQueryPtr finalQuery = newLucene(); - finalQuery->add(mainQuery, BooleanClause::MUST); - finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using multi-path 
prefix query for content search optimization:" << searchPathsList; - mainQuery = finalQuery; - } + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); + if (pathPrefixQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(pathPrefixQuery, BooleanClause::MUST); + qInfo() << "Using multi-path prefix query for content search optimization:" << searchPathsList; + mainQuery = finalQuery; } } diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp index bb8c606d..44c955c2 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp @@ -782,21 +782,12 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que // Add path prefix query optimization QStringList searchPathsList = m_options.searchPaths(); if (hasValidQuery && SearchUtility::isFilenameIndexAncestorPathsSupported()) { - bool usePrefixQuery = false; - for (const QString &p : searchPathsList) { - if (SearchUtility::shouldUsePathPrefixQuery(p)) { - usePrefixQuery = true; - break; - } - } - if (usePrefixQuery) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( - searchPathsList, - QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); - if (pathPrefixQuery) { - finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using multi-path prefix query for optimization:" << searchPathsList; - } + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); + if (pathPrefixQuery) { + finalQuery->add(pathPrefixQuery, 
BooleanClause::MUST); + qInfo() << "Using multi-path prefix query for optimization:" << searchPathsList; } } diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index 9c0e6ad5..da9aceda 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -100,24 +100,15 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Add path prefix query optimization QStringList searchPathsList = m_options.searchPaths(); if (mainQuery && SearchUtility::isOcrTextIndexAncestorPathsSupported()) { - bool usePrefixQuery = false; - for (const QString &p : searchPathsList) { - if (SearchUtility::shouldUsePathPrefixQuery(p)) { - usePrefixQuery = true; - break; - } - } - if (usePrefixQuery) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( - searchPathsList, - QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); - if (pathPrefixQuery) { - BooleanQueryPtr finalQuery = newLucene(); - finalQuery->add(mainQuery, BooleanClause::MUST); - finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using multi-path prefix query for OCR text search optimization:" << searchPathsList; - mainQuery = finalQuery; - } + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); + if (pathPrefixQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(pathPrefixQuery, BooleanClause::MUST); + qInfo() << "Using multi-path prefix query for OCR text search optimization:" << searchPathsList; + mainQuery = finalQuery; } } @@ -398,7 +389,7 @@ void OcrTextIndexedStrategy::processSearchResults(const 
Lucene::IndexSearcherPtr resultApi.setOcrContent(content); // 设置高亮内容 const QString highlightedContent = ContentHighlighter::customHighlight( - m_keywords, content, previewLen, enableHTML); + m_keywords, content, previewLen, enableHTML); resultApi.setHighlightedContent(highlightedContent); } } catch (const Lucene::LuceneException &e) { diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp index e02d46d4..aa903b39 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -68,21 +68,15 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) : plan.searchDirectories; // Step 5: Set up signal/slot handlers - auto onResultsFound = [this](const SearchResultList &results) { - SearchResultList newResults; + auto onFinished = [this](const SearchResultList &results) { + // Collect and deduplicate results from each engine's final result list for (const SearchResult &r : results) { if (!seenPaths.contains(r.path())) { seenPaths.insert(r.path()); - newResults.append(r); + allResults.append(r); } } - if (!newResults.isEmpty()) { - allResults.append(newResults); - Q_EMIT q->resultsFound(newResults); - } - }; - auto onFinished = [this](const SearchResultList &) { if (pendingFinishCount.fetch_sub(1) == 1) { // All engines finished timeoutTimer->stop(); @@ -121,6 +115,13 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) return opts; }; + // Apply caller-level options + auto applyCallerOptions = [this](SearchOptions &opts) { + if (detailedResultsEnabled) { + opts.setDetailedResultsEnabled(true); + } + }; + // Step 8: Launch up to 3 engines (FileName, Content, OCR) // TimeField::Both is no longer expanded here; it is handled by the Lucene strategy layer. // Multiple directories are passed via setSearchPaths(). 
@@ -128,22 +129,25 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) // File name search (always, if index is ready) if (Global::isFileNameIndexReadyForSearch()) { SearchOptions fnameOpts = prepareOptions(plan.fileNameOptions); + applyCallerOptions(fnameOpts); createAndLaunchEngine(SearchType::FileName, plan.fileNameQuery, - fnameOpts, onResultsFound, onFinished, onError); + fnameOpts, onFinished, onError); } // Content search if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { SearchOptions contentOpts = prepareOptions(*plan.contentOptions); + applyCallerOptions(contentOpts); createAndLaunchEngine(SearchType::Content, *plan.contentQuery, - contentOpts, onResultsFound, onFinished, onError); + contentOpts, onFinished, onError); } // OCR search if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { SearchOptions ocrOpts = prepareOptions(*plan.ocrOptions); + applyCallerOptions(ocrOpts); createAndLaunchEngine(SearchType::Ocr, *plan.ocrQuery, - ocrOpts, onResultsFound, onFinished, onError); + ocrOpts, onFinished, onError); } // Step 9: Handle no-engine case @@ -163,14 +167,12 @@ void SemanticSearcherData::createAndLaunchEngine( SearchType type, const SearchQuery &query, const SearchOptions &options, - std::function onResultsFound, std::function onFinished, std::function onError) { SearchEngine *engine = SearchEngine::create(type, q); engine->setSearchOptions(options); - QObject::connect(engine, &SearchEngine::resultsFound, q, onResultsFound); QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished); QObject::connect(engine, &SearchEngine::errorOccurred, q, onError); @@ -244,6 +246,16 @@ void SemanticSearcher::cancel() d_ptr->doCancel(); } +void SemanticSearcher::setDetailedResultsEnabled(bool enable) +{ + d_ptr->detailedResultsEnabled = enable; +} + +bool SemanticSearcher::isDetailedResultsEnabled() const +{ + return d_ptr->detailedResultsEnabled; +} + SearchResultExpected SemanticSearcher::searchSync(const 
QString &naturalLanguage) { if (d_ptr->status.load() == SearchStatus::Searching) { diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h index ef1d2801..8deb1ee0 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h @@ -42,14 +42,12 @@ class SemanticSearcherData * @param type The search engine type (FileName, Content, or Ocr) * @param query The search query to execute * @param options The search options (including multi-path and time filter) - * @param onResultsFound Callback for result aggregation - * @param onFinished Callback for engine completion tracking + * @param onFinished Callback for engine completion and result collection * @param onError Callback for error handling */ void createAndLaunchEngine(SearchType type, const SearchQuery &query, const SearchOptions &options, - std::function onResultsFound, std::function onFinished, std::function onError); @@ -75,6 +73,9 @@ class SemanticSearcherData // Timeout QTimer *timeoutTimer = nullptr; + + // Options forwarded from caller + bool detailedResultsEnabled = false; }; DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/searchutility.cpp b/src/dfm-search/dfm-search-lib/utils/searchutility.cpp index 19303e47..68186e25 100644 --- a/src/dfm-search/dfm-search-lib/utils/searchutility.cpp +++ b/src/dfm-search/dfm-search-lib/utils/searchutility.cpp @@ -876,27 +876,5 @@ QStringList deepinAnythingFileTypes() return kTypes; } -bool shouldUsePathPrefixQuery(const QString &searchPath) -{ - // Don't use path prefix query for root directory - if (searchPath == "/" || searchPath.isEmpty()) { - return false; - } - - // Check if it's one of the default indexed directories - const QStringList &defaultDirs = Global::defaultIndexedDirectory(); - for (const QString &defaultDir : defaultDirs) { - QString normalizedDefault = QDir::cleanPath(defaultDir); - 
QString normalizedSearch = QDir::cleanPath(searchPath); - - // Don't use path prefix query if search path is one of the default indexed directories - if (normalizedSearch == normalizedDefault) { - return false; - } - } - - return true; -} - } // namespace SearchUtility DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/searchutility.h b/src/dfm-search/dfm-search-lib/utils/searchutility.h index 1fa74fd8..19d177bc 100644 --- a/src/dfm-search/dfm-search-lib/utils/searchutility.h +++ b/src/dfm-search/dfm-search-lib/utils/searchutility.h @@ -33,13 +33,6 @@ QStringList extractBooleanKeywords(const SearchQuery &query); */ QStringList deepinAnythingFileTypes(); -/** - * @brief Check if path prefix query optimization should be used - * @param searchPath The search path - * @return true if path prefix query should be used, false otherwise - */ -bool shouldUsePathPrefixQuery(const QString &searchPath); - /** * @brief Check if the filename index supports the ancestor_paths field. * This function checks the filename index version and returns true if the version is greater than 3. From 648f64c91c61518d54687ab30671ef20affe8329 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 18 May 2026 09:33:17 +0800 Subject: [PATCH 15/36] feat: enhance semantic search with explicit directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added new search() and searchSync() overloads that accept explicit search directories parameter 2. Implemented search directory priority: explicit > NLP-parsed > home directory 3. Added intentParsed() signal to emit NLP parsing results before search starts 4. Extended JsonOutput to serialize ParsedIntent to JSON 5. Updated CLI client to support search path in semantic mode 6. Improved semantic search infrastructure with better directory handling Log: Added support for explicit search directories in semantic search Log: Added intentParsed signal showing NLP parsing results Influence: 1. 
Test semantic search with explicit directories vs natural language- parsed 2. Verify JSON output contains ParsedIntent details when available 3. Check search directory priority handling 4. Test intentParsed signal timing relative to searchStarted 5. Verify backward compatibility with single-parameter search() calls 6. Test JSON output format in both streaming and complete modes 7. Validate CLI behavior with and without specified search paths feat: 增强语义搜索功能,支持显式搜索目录 1. 新增支持显式搜索目录参数的 search() 和 searchSync() 方法重载 2. 实现搜索目录优先级:显式指定 > NLP解析 > 家目录 3. 添加 intentParsed() 信号,在搜索开始前发送NLP解析结果 4. 扩展 JsonOutput 以支持 ParsedIntent 的JSON序列化 5. 更新命令行客户端支持语义模式下的搜索路径 6. 改进语义搜索基础设施,提供更好的目录处理能力 Log: 新增语义搜索中显式搜索目录支持 Log: 添加显示NLP解析结果的intentParsed信号 Influence: 1. 测试显式目录与自然语言解析目录的语义搜索 2. 验证JSON输出是否包含可用的ParsedIntent详情 3. 检查搜索目录优先级处理是否正确 4. 测试intentParsed信号与searchStarted信号的时序关系 5. 验证单参数search()调用的向后兼容性 6. 测试流式输出和完整输出模式下的JSON格式 7. 验证带路径和不带路径情况下CLI的行为 --- .../dfm-search/dfm-search/semanticsearcher.h | 31 +++++ src/dfm-search/dfm-search-client/main.cpp | 12 +- .../dfm-search-client/output/json_output.cpp | 129 ++++++++++++++++++ .../dfm-search-client/output/json_output.h | 13 ++ .../semantic/semanticsearcher.cpp | 40 ++++-- .../semantic/semanticsearcher_p.h | 2 +- 6 files changed, 217 insertions(+), 10 deletions(-) diff --git a/include/dfm-search/dfm-search/semanticsearcher.h b/include/dfm-search/dfm-search/semanticsearcher.h index ac39df6e..943f19f6 100644 --- a/include/dfm-search/dfm-search/semanticsearcher.h +++ b/include/dfm-search/dfm-search/semanticsearcher.h @@ -10,6 +10,7 @@ #include #include #include +#include DFM_SEARCH_BEGIN_NS @@ -71,6 +72,18 @@ class SemanticSearcher : public QObject */ void search(const QString &naturalLanguage); + /** + * @brief Perform a semantic search with explicit search directories + * + * When @p searchDirectories is non-empty, those directories take priority + * over any directories resolved from the natural language input. 
+ * If empty, falls back to NLP-parsed directories, then home directory. + * + * @param naturalLanguage The natural language query string + * @param searchDirectories Explicit directories to search in + */ + void search(const QString &naturalLanguage, const QStringList &searchDirectories); + /** * @brief Check if the input contains semantic intent beyond a plain keyword. * @@ -95,6 +108,14 @@ class SemanticSearcher : public QObject */ SearchResultExpected searchSync(const QString &naturalLanguage); + /** + * @brief Perform a synchronous semantic search with explicit directories + * @param naturalLanguage The natural language query string + * @param searchDirectories Explicit directories to search in + * @return SearchResultExpected containing deduplicated results or an error + */ + SearchResultExpected searchSync(const QString &naturalLanguage, const QStringList &searchDirectories); + /** * @brief Cancel the current search operation */ @@ -117,6 +138,16 @@ class SemanticSearcher : public QObject bool isDetailedResultsEnabled() const; Q_SIGNALS: + /** + * @brief Emitted after the natural language input is parsed into an intent + * + * This fires before searchStarted(), allowing callers to inspect + * what the NLP parser understood from the input. 
+ * + * @param intent The parsed intent structure + */ + void intentParsed(const DFMSEARCH::ParsedIntent &intent); + /** * @brief Emitted when a search operation starts */ diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index c363af7b..c077befd 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -189,6 +189,12 @@ int main(int argc, char *argv[]) SearchType::Semantic, SearchMethod::Indexed); QObject::connect(formatter, &OutputFormatter::finished, &app, &QCoreApplication::quit); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::intentParsed, + [formatter](const DFMSEARCH::ParsedIntent &intent) { + if (auto *jsonOut = qobject_cast(formatter)) { + jsonOut->setParsedIntent(intent); + } + }); QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchStarted, [formatter]() { formatter->outputSearchStarted(); }); @@ -202,7 +208,11 @@ int main(int argc, char *argv[]) formatter->outputError(error); }); - semanticSearcher->search(config.keyword); + QStringList semanticDirs; + if (!config.searchPath.isEmpty()) { + semanticDirs = QStringList { config.searchPath }; + } + semanticSearcher->search(config.keyword, semanticDirs); return app.exec(); } diff --git a/src/dfm-search/dfm-search-client/output/json_output.cpp b/src/dfm-search/dfm-search-client/output/json_output.cpp index 2b3e6482..e2979940 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.cpp +++ b/src/dfm-search/dfm-search-client/output/json_output.cpp @@ -22,6 +22,125 @@ void JsonOutput::setSearchContext(const QString &keyword, const QString &searchP m_searchMethod = searchMethod; } +QJsonObject JsonOutput::intentToJson(const DFMSEARCH::ParsedIntent &intent) +{ + QJsonObject obj; + + // timeConstraint + if (intent.timeConstraint.isValid()) { + QJsonObject time; + QString kindStr; + switch (intent.timeConstraint.kind) { + case DFMSEARCH::TimeConstraintKind::Preset: + kindStr 
= "preset"; + { + QString presetStr; + switch (intent.timeConstraint.preset) { + case DFMSEARCH::TimePreset::Today: presetStr = "today"; break; + case DFMSEARCH::TimePreset::Yesterday: presetStr = "yesterday"; break; + case DFMSEARCH::TimePreset::DayBeforeYesterday: presetStr = "dayBeforeYesterday"; break; + case DFMSEARCH::TimePreset::ThisWeek: presetStr = "thisWeek"; break; + case DFMSEARCH::TimePreset::LastWeek: presetStr = "lastWeek"; break; + case DFMSEARCH::TimePreset::ThisMonth: presetStr = "thisMonth"; break; + case DFMSEARCH::TimePreset::LastMonth: presetStr = "lastMonth"; break; + case DFMSEARCH::TimePreset::ThisYear: presetStr = "thisYear"; break; + case DFMSEARCH::TimePreset::LastYear: presetStr = "lastYear"; break; + } + time["preset"] = presetStr; + } + break; + case DFMSEARCH::TimeConstraintKind::Relative: + kindStr = "relative"; + time["value"] = intent.timeConstraint.relativeValue; + { + QString unitStr; + switch (intent.timeConstraint.relativeUnit) { + case DFMSEARCH::TimeUnit::Minutes: unitStr = "minutes"; break; + case DFMSEARCH::TimeUnit::Hours: unitStr = "hours"; break; + case DFMSEARCH::TimeUnit::Days: unitStr = "days"; break; + case DFMSEARCH::TimeUnit::Weeks: unitStr = "weeks"; break; + case DFMSEARCH::TimeUnit::Months: unitStr = "months"; break; + case DFMSEARCH::TimeUnit::Years: unitStr = "years"; break; + } + time["unit"] = unitStr; + } + break; + case DFMSEARCH::TimeConstraintKind::Custom: + kindStr = "custom"; + if (intent.timeConstraint.customStart.isValid()) + time["start"] = intent.timeConstraint.customStart.toString(Qt::ISODate); + if (intent.timeConstraint.customEnd.isValid()) + time["end"] = intent.timeConstraint.customEnd.toString(Qt::ISODate); + break; + case DFMSEARCH::TimeConstraintKind::None: + break; + } + time["kind"] = kindStr; + + if (intent.timeConstraint.timeField != DFMSEARCH::TimeField::Unspecified) { + QString fieldStr; + switch (intent.timeConstraint.timeField) { + case DFMSEARCH::TimeField::ModifyTime: fieldStr = 
"modifyTime"; break; + case DFMSEARCH::TimeField::BirthTime: fieldStr = "birthTime"; break; + case DFMSEARCH::TimeField::Both: fieldStr = "both"; break; + default: fieldStr = "unspecified"; break; + } + time["timeField"] = fieldStr; + } + + obj["timeConstraint"] = time; + } + + // sizeConstraint + if (intent.sizeConstraint.isValid()) { + QJsonObject size; + if (intent.sizeConstraint.minSize > 0) + size["minBytes"] = intent.sizeConstraint.minSize; + if (intent.sizeConstraint.maxSize > 0) + size["maxBytes"] = intent.sizeConstraint.maxSize; + size["includeLower"] = intent.sizeConstraint.includeLower; + size["includeUpper"] = intent.sizeConstraint.includeUpper; + obj["sizeConstraint"] = size; + } + + // fileExtensions + if (!intent.fileExtensions.isEmpty()) { + obj["fileExtensions"] = QJsonArray::fromStringList(intent.fileExtensions); + } + + // searchDirectories (NLP-parsed, before caller override is applied) + if (!intent.searchDirectories.isEmpty()) { + obj["searchDirectories"] = QJsonArray::fromStringList(intent.searchDirectories); + } + + // keywords + if (!intent.keywords.isEmpty()) { + obj["keywords"] = QJsonArray::fromStringList(intent.keywords); + } + + // includeHidden + obj["includeHidden"] = intent.includeHidden; + + // consumedSpans + if (!intent.consumedSpans.isEmpty()) { + QJsonArray spans; + for (const auto &span : intent.consumedSpans) { + if (span.isValid()) { + QJsonObject s; + s["start"] = span.start; + s["end"] = span.end; + s["ruleId"] = span.ruleId; + spans.append(s); + } + } + if (!spans.isEmpty()) { + obj["consumedSpans"] = spans; + } + } + + return obj; +} + QJsonValue JsonOutput::resultToJson(const SearchResult &result) { // 如果不启用详细结果,只返回路径 @@ -260,6 +379,11 @@ void JsonOutput::outputStreamingStart() searchInfo["sizeRangeFilter"] = sizeFilterInfo; } + // 语义搜索:附加 ParsedIntent + if (m_searchType == SearchType::Semantic && m_parsedIntent.has_value()) { + searchInfo["intent"] = intentToJson(*m_parsedIntent); + } + startObj["search"] = searchInfo; 
startObj["timestamp"] = QDateTime::currentDateTime().toString(Qt::ISODate); @@ -371,6 +495,11 @@ void JsonOutput::outputCompleteResult(const QList &results) searchInfo["sizeRangeFilter"] = sizeFilterInfo; } + // 语义搜索:附加 ParsedIntent + if (m_searchType == SearchType::Semantic && m_parsedIntent.has_value()) { + searchInfo["intent"] = intentToJson(*m_parsedIntent); + } + root["search"] = searchInfo; // 时间戳 diff --git a/src/dfm-search/dfm-search-client/output/json_output.h b/src/dfm-search/dfm-search-client/output/json_output.h index 69d4db2f..a8c14d96 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.h +++ b/src/dfm-search/dfm-search-client/output/json_output.h @@ -7,9 +7,11 @@ #include "output_formatter.h" #include +#include #include #include +#include namespace dfmsearch { @@ -40,6 +42,13 @@ class JsonOutput : public OutputFormatter */ void setSearchOptions(const SearchOptions &options) { m_options = options; } + /** + * @brief 设置语义搜索解析的 ParsedIntent + * + * 在 searchType 为 Semantic 时,intent 信息会序列化到 JSON 输出中 + */ + void setParsedIntent(const DFMSEARCH::ParsedIntent &intent) { m_parsedIntent = intent; } + /** * @brief 设置是否启用详细输出模式 * @param verbose true 启用详细输出 @@ -50,6 +59,9 @@ class JsonOutput : public OutputFormatter QJsonValue resultToJson(const SearchResult &result); void printJsonLine(const QJsonObject &obj); + // Intent 序列化辅助 + QJsonObject intentToJson(const DFMSEARCH::ParsedIntent &intent); + // 流式输出方法 void outputStreamingStart(); void outputStreamingResult(const SearchResult &result); @@ -69,6 +81,7 @@ class JsonOutput : public OutputFormatter bool m_streaming; bool m_verbose = false; QJsonArray m_collectedResults; + std::optional m_parsedIntent; }; } // namespace dfmsearch diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp index aa903b39..9b9ae54d 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp +++ 
b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -40,7 +40,7 @@ SemanticSearcherData::~SemanticSearcherData() doCancel(); } -void SemanticSearcherData::doSearch(const QString &naturalLanguage) +void SemanticSearcherData::doSearch(const QString &naturalLanguage, const QStringList &searchDirectories) { if (naturalLanguage.trimmed().isEmpty()) { Q_EMIT q->errorOccurred(SearchError(SearchErrorCode::InvalidQuery)); @@ -53,19 +53,28 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage) seenPaths.clear(); status.store(SearchStatus::Searching); Q_EMIT q->statusChanged(SearchStatus::Searching); - Q_EMIT q->searchStarted(); - // Step 2: Parse natural language into intent + // Step 2: Parse natural language into intent (before searchStarted + // so that intentParsed listeners have the data when searchStarted fires) ParsedIntent intent; intentParser->parse(naturalLanguage, intent); + Q_EMIT q->intentParsed(intent); + + Q_EMIT q->searchStarted(); // Step 3: Build search plan const SemanticSearchPlan plan = queryBuilder->build(intent); // Step 4: Determine search directories - QStringList dirs = plan.searchDirectories.isEmpty() - ? 
QStringList { QDir::homePath() } - : plan.searchDirectories; + // Priority: caller-specified directories > NLP-parsed directories > home directory + QStringList dirs; + if (!searchDirectories.isEmpty()) { + dirs = searchDirectories; + } else if (!plan.searchDirectories.isEmpty()) { + dirs = plan.searchDirectories; + } else { + dirs = QStringList { QDir::homePath() }; + } // Step 5: Set up signal/slot handlers auto onFinished = [this](const SearchResultList &results) { @@ -223,7 +232,17 @@ void SemanticSearcher::search(const QString &naturalLanguage) return; } - d_ptr->doSearch(naturalLanguage); + d_ptr->doSearch(naturalLanguage, {}); +} + +void SemanticSearcher::search(const QString &naturalLanguage, const QStringList &searchDirectories) +{ + if (d_ptr->status.load() == SearchStatus::Searching) { + qWarning() << "Search already in progress"; + return; + } + + d_ptr->doSearch(naturalLanguage, searchDirectories); } bool SemanticSearcher::isSemanticQuery(const QString &input) const @@ -257,6 +276,11 @@ bool SemanticSearcher::isDetailedResultsEnabled() const } SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage) +{ + return searchSync(naturalLanguage, {}); +} + +SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage, const QStringList &searchDirectories) { if (d_ptr->status.load() == SearchStatus::Searching) { qWarning() << "Search already in progress"; @@ -306,7 +330,7 @@ SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage }); // Start the async search (uses internal timeout mechanism) - d_ptr->doSearch(naturalLanguage); + d_ptr->doSearch(naturalLanguage, searchDirectories); // Block until completion, cancellation, or error eventLoop.exec(); diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h index 8deb1ee0..8b792e9a 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h +++ 
b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h @@ -29,7 +29,7 @@ class SemanticSearcherData explicit SemanticSearcherData(SemanticSearcher *q); ~SemanticSearcherData(); - void doSearch(const QString &naturalLanguage); + void doSearch(const QString &naturalLanguage, const QStringList &searchDirectories); void doCancel(); /** From 64dc847e7697538680524a497c80d7d1082c890a Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 18 May 2026 10:09:14 +0800 Subject: [PATCH 16/36] feat: add max results limit for semantic search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added max results parameter to control search result volume 2. Implemented result truncation after deduplication in semantic search 3. Added CLI option to set max results from command line 4. Results are now limited in both individual engines and final output 5. Search options are properly forwarded to all sub-engines Log: Added ability to limit maximum search results in semantic search Influence: 1. Test search with max results set to various values (0, 10, 1000) 2. Verify results are properly truncated while maintaining deduplication 3. Check command line option --max-results functionality 4. Test interaction with detailed results mode 5. Verify engine-level and final-level result limiting feat: 为语义搜索添加最大结果数限制 1. 添加最大结果数参数以控制搜索结果数量 2. 在语义搜索中去重后实现结果截断 3. 添加命令行选项设置最大结果数 4. 单个引擎和最终输出结果都受到限制 5. 搜索选项正确转发到所有子引擎 Log: 新增语义搜索结果数量限制功能 Influence: 1. 测试使用不同最大结果数(0, 10, 1000)的搜索 2. 验证结果在去重后正确截断 3. 检查命令行选项--max-results的功能 4. 测试与详细结果模式的交互 5. 
验证引擎级别和最终级别的结果限制 --- .../dfm-search/dfm-search/semanticsearcher.h | 17 +++++++++++++++++ .../dfm-search-client/cli_options.cpp | 7 +++++++ src/dfm-search/dfm-search-client/main.cpp | 3 +++ .../semantic/semanticsearcher.cpp | 19 +++++++++++++++++++ .../semantic/semanticsearcher_p.h | 1 + 5 files changed, 47 insertions(+) diff --git a/include/dfm-search/dfm-search/semanticsearcher.h b/include/dfm-search/dfm-search/semanticsearcher.h index 943f19f6..10e81a8e 100644 --- a/include/dfm-search/dfm-search/semanticsearcher.h +++ b/include/dfm-search/dfm-search/semanticsearcher.h @@ -137,6 +137,23 @@ class SemanticSearcher : public QObject */ bool isDetailedResultsEnabled() const; + /** + * @brief Set the maximum number of results to return + * + * Each sub-engine (FileName, Content, OCR) will be limited to this count. + * After all engines finish, results are deduplicated and truncated + * to this count. + * + * @param count Maximum result count (0 = unlimited, default 0) + */ + void setMaxResults(int count); + + /** + * @brief Get the maximum number of results + * @return Maximum result count (0 = unlimited) + */ + int maxResults() const; + Q_SIGNALS: /** * @brief Emitted after the natural language input is parsed into an intent diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index 3c19b388..38f9e8f7 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -228,6 +228,13 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) config.maxPreviewLength = previewLength; } } + if (m_parser.isSet(m_maxResultsOption)) { + bool ok; + int maxResults = m_parser.value(m_maxResultsOption).toInt(&ok); + if (ok && maxResults >= 0) { + config.maxResults = maxResults; + } + } return true; } diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index c077befd..f5335c43 100644 --- 
a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -166,6 +166,9 @@ int main(int argc, char *argv[]) if (config.semanticMode) { auto *semanticSearcher = new DFMSEARCH::SemanticSearcher(&app); semanticSearcher->setDetailedResultsEnabled(config.verbose); + if (config.maxResults > 0) { + semanticSearcher->setMaxResults(config.maxResults); + } OutputFormatter *formatter = createOutputFormatter(config, &app); diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp index 9b9ae54d..8f9df560 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp @@ -89,6 +89,12 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage, const QStrin if (pendingFinishCount.fetch_sub(1) == 1) { // All engines finished timeoutTimer->stop(); + + // Truncate final deduplicated results to maxResults + if (maxResults > 0 && allResults.size() > maxResults) { + allResults = allResults.mid(0, maxResults); + } + if (cancelled.load()) { status.store(SearchStatus::Cancelled); Q_EMIT q->statusChanged(SearchStatus::Cancelled); @@ -129,6 +135,9 @@ void SemanticSearcherData::doSearch(const QString &naturalLanguage, const QStrin if (detailedResultsEnabled) { opts.setDetailedResultsEnabled(true); } + if (maxResults > 0) { + opts.setMaxResults(maxResults); + } }; // Step 8: Launch up to 3 engines (FileName, Content, OCR) @@ -275,6 +284,16 @@ bool SemanticSearcher::isDetailedResultsEnabled() const return d_ptr->detailedResultsEnabled; } +void SemanticSearcher::setMaxResults(int count) +{ + d_ptr->maxResults = count; +} + +int SemanticSearcher::maxResults() const +{ + return d_ptr->maxResults; +} + SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage) { return searchSync(naturalLanguage, {}); diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h 
b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h index 8b792e9a..dd146ff2 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h +++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h @@ -76,6 +76,7 @@ class SemanticSearcherData // Options forwarded from caller bool detailedResultsEnabled = false; + int maxResults = 0; // 0 = unlimited }; DFM_SEARCH_END_NS From eb1079fd3b8c04456c55b9c3808344f5789b3e05 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 18 May 2026 11:55:03 +0800 Subject: [PATCH 17/36] test: add search target control tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added SearchTarget enum to specify where to search (filename/content/ all) in semantic search 2. Implemented keyword extraction rule metadata to detect target from user query 3. Modified SemanticQueryBuilder to selectively enable search paths based on target 4. Added comprehensive test cases for: - Search target detection from user queries - Query builder behavior for different targets - Default fallback behavior 5. Updated Chinese keyword rules with search_target metadata Influence: 1. Test Chinese queries with filename/content keywords 2. Verify default fallback to all search paths 3. Check query builder produces correct search plans 4. Test boundary cases like empty keywords or invalid rules 5. Verify search target metadata handling in rules test: 添加搜索目标控制测试 1. 新增SearchTarget枚举类型用于指定语义搜索的范围(文件名/内容/全部) 2. 实现关键字提取规则元数据,从用户查询中检测搜索目标 3. 修改语义查询构建器,根据目标选择性启用搜索路径 4. 添加完整的测试用例包括: - 从用户查询检测搜索目标 - 查询构建器对不同目标的处理 - 默认回退行为测试 5. 更新中文关键字规则文件,添加search_target元数据 Influence: 1. 测试包含文件名/内容关键字的中文查询 2. 验证默认回退到全路径搜索的行为 3. 检查查询构建器生成的搜索计划是否正确 4. 测试边界情况如空关键字或无效规则 5. 
验证规则文件中搜索目标元数据的处理 --- autotests/dfm-search-tests/main.cpp | 10 ++ .../dfm-search-tests/tst_semantic_search.cpp | 155 ++++++++++++++++++ .../dfm-search/dfm-search/semantic_types.h | 13 ++ .../semantic/extractors/keywordextractor.cpp | 9 + .../semantic/rules/zh_CN/keyword_rules.json | 19 ++- .../semantic/semanticquerybuilder.cpp | 19 ++- 6 files changed, 218 insertions(+), 7 deletions(-) diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index 88092b03..202a2ea2 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -17,6 +17,8 @@ extern QObject *create_tst_ParsedIntent(); extern QObject *create_tst_ChineseNLP(); extern QObject *create_tst_SizeRangeFilter(); extern QObject *create_tst_IsSemanticQuery(); +extern QObject *create_tst_SearchTarget(); +extern QObject *create_tst_SemanticQueryBuilderTarget(); int main(int argc, char *argv[]) { @@ -71,5 +73,13 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj12, argc, argv); delete testObj12; + QObject *testObj13 = create_tst_SearchTarget(); + result |= QTest::qExec(testObj13, argc, argv); + delete testObj13; + + QObject *testObj14 = create_tst_SemanticQueryBuilderTarget(); + result |= QTest::qExec(testObj14, argc, argv); + delete testObj14; + return result; } diff --git a/autotests/dfm-search-tests/tst_semantic_search.cpp b/autotests/dfm-search-tests/tst_semantic_search.cpp index a9aa384f..86d6cf42 100644 --- a/autotests/dfm-search-tests/tst_semantic_search.cpp +++ b/autotests/dfm-search-tests/tst_semantic_search.cpp @@ -14,6 +14,8 @@ #include "semantic/semanticruleengine.h" #include "semantic/intentparser.h" #include "semantic/ruleconfigloader.h" +#include "semantic/extractors/keywordextractor.h" +#include "semantic/semanticquerybuilder.h" using namespace DFMSEARCH; @@ -896,6 +898,157 @@ void tst_IsSemanticQuery::noiseWordsOnly() QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "查找")); } +// ===== tst_SearchTarget ===== + +class 
tst_SearchTarget : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void initTestCase(); + void defaultIsAll(); + void filenameContains(); + void filenameNamed(); + void contentContains(); + void genericContainsStaysAll(); + void unconsumedTextStaysAll(); + +private: + SemanticRuleEngine *m_engine = nullptr; + KeywordExtractor *m_extractor = nullptr; +}; + +void tst_SearchTarget::initTestCase() +{ + if (!sourceRulesAvailable()) { + QSKIP("Rule files not found in source tree, skipping search target tests"); + } + + m_engine = new SemanticRuleEngine(this); + const QString dir = sourceRulesDir(); + const QStringList ruleFiles = QDir(dir).entryList( + {"*.json"}, QDir::Files, QDir::Name); + for (const QString &filename : ruleFiles) { + QString path = dir + "/" + filename; + if (!m_engine->loadRuleFile(path)) { + qWarning() << "Failed to load rule file:" << path; + } + } + + m_extractor = new KeywordExtractor(m_engine); +} + +void tst_SearchTarget::defaultIsAll() +{ + ParsedIntent intent; + m_extractor->extract("蓝天白云", intent); + QCOMPARE(intent.searchTarget, SearchTarget::All); +} + +void tst_SearchTarget::filenameContains() +{ + ParsedIntent intent; + m_extractor->extract("文件名包含测试的文档", intent); + QCOMPARE(intent.searchTarget, SearchTarget::FileNameOnly); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("测试")); +} + +void tst_SearchTarget::filenameNamed() +{ + ParsedIntent intent; + m_extractor->extract("名为报告的文件", intent); + QCOMPARE(intent.searchTarget, SearchTarget::FileNameOnly); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("报告")); +} + +void tst_SearchTarget::contentContains() +{ + ParsedIntent intent; + m_extractor->extract("文件内容包含配置的文档", intent); + QCOMPARE(intent.searchTarget, SearchTarget::ContentOnly); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("配置")); +} + +void tst_SearchTarget::genericContainsStaysAll() +{ + ParsedIntent intent; + 
m_extractor->extract("包含测试的文件", intent); + QCOMPARE(intent.searchTarget, SearchTarget::All); + QVERIFY(!intent.keywords.isEmpty()); +} + +void tst_SearchTarget::unconsumedTextStaysAll() +{ + // No structured keyword rule matches → unconsumed text extraction + ParsedIntent intent; + m_extractor->extract("项目计划书", intent); + QCOMPARE(intent.searchTarget, SearchTarget::All); + QVERIFY(!intent.keywords.isEmpty()); +} + +// ===== tst_SemanticQueryBuilderTarget ===== + +class tst_SemanticQueryBuilderTarget : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void defaultTarget(); + void fileNameOnlyTarget(); + void contentOnlyTarget(); + +private: + ParsedIntent makeIntent(const QStringList &keywords, SearchTarget target) const; +}; + +ParsedIntent tst_SemanticQueryBuilderTarget::makeIntent( + const QStringList &keywords, SearchTarget target) const +{ + ParsedIntent intent; + intent.keywords = keywords; + intent.searchTarget = target; + return intent; +} + +void tst_SemanticQueryBuilderTarget::defaultTarget() +{ + SemanticQueryBuilder builder; + ParsedIntent intent = makeIntent({"测试"}, SearchTarget::All); + SemanticSearchPlan plan = builder.build(intent); + + // All three paths should produce queries + QVERIFY(!plan.fileNameQuery.keyword().isEmpty()); + QVERIFY(plan.contentQuery.has_value()); + QVERIFY(plan.ocrQuery.has_value()); +} + +void tst_SemanticQueryBuilderTarget::fileNameOnlyTarget() +{ + SemanticQueryBuilder builder; + ParsedIntent intent = makeIntent({"测试"}, SearchTarget::FileNameOnly); + SemanticSearchPlan plan = builder.build(intent); + + // Only filename query should be built + QVERIFY(!plan.fileNameQuery.keyword().isEmpty()); + QVERIFY(!plan.contentQuery.has_value()); + QVERIFY(!plan.ocrQuery.has_value()); +} + +void tst_SemanticQueryBuilderTarget::contentOnlyTarget() +{ + SemanticQueryBuilder builder; + ParsedIntent intent = makeIntent({"测试"}, SearchTarget::ContentOnly); + SemanticSearchPlan plan = builder.build(intent); + + // Filename should NOT be 
built; content and ocr should + QVERIFY(plan.fileNameQuery.keyword().isEmpty()); + QVERIFY(plan.contentQuery.has_value()); + QVERIFY(plan.ocrQuery.has_value()); +} + // ===== Factory functions ===== QObject *create_tst_RuleEngine() { return new tst_RuleEngine(); } @@ -904,5 +1057,7 @@ QObject *create_tst_FileTypeExtraction() { return new tst_FileTypeExtraction(); QObject *create_tst_KeywordExtraction() { return new tst_KeywordExtraction(); } QObject *create_tst_ParsedIntent() { return new tst_ParsedIntent(); } QObject *create_tst_IsSemanticQuery() { return new tst_IsSemanticQuery(); } +QObject *create_tst_SearchTarget() { return new tst_SearchTarget(); } +QObject *create_tst_SemanticQueryBuilderTarget() { return new tst_SemanticQueryBuilderTarget(); } #include "tst_semantic_search.moc" diff --git a/include/dfm-search/dfm-search/semantic_types.h b/include/dfm-search/dfm-search/semantic_types.h index f68a5f70..520aea03 100644 --- a/include/dfm-search/dfm-search/semantic_types.h +++ b/include/dfm-search/dfm-search/semantic_types.h @@ -80,6 +80,18 @@ struct SizeConstraint bool isValid() const { return minSize > 0 || maxSize > 0; } }; +/** + * @brief Represents the target scope for a semantic search. + * + * When a user explicitly specifies where to search (e.g. "文件名包含XX" + * vs "文件内容包含XX"), this enum controls which search paths are enabled. + */ +enum class SearchTarget { + All, ///< All search paths enabled (default) + FileNameOnly, ///< Only filename search + ContentOnly ///< Only content + OCR search +}; + /** * @brief Represents the parsed intent from natural language input. 
* @@ -95,6 +107,7 @@ struct ParsedIntent QStringList searchDirectories; // Absolute paths resolved from location words bool includeHidden = false; // true for trash (hidden directory) QStringList keywords; + SearchTarget searchTarget = SearchTarget::All; QList consumedSpans; }; diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp index 0feda2dd..35af0891 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp @@ -60,6 +60,15 @@ bool KeywordExtractor::extractStructuredKeywords(const QString &input, ParsedInt intent.keywords = { captured }; } + // Determine search target from rule metadata + const QString targetStr = metadata.value("search_target").toString(); + if (targetStr == "filename") { + intent.searchTarget = SearchTarget::FileNameOnly; + } else if (targetStr == "content") { + intent.searchTarget = SearchTarget::ContentOnly; + } + // "all" or empty → keep default SearchTarget::All + // Mark the entire matched region as consumed MatchSpan span; span.start = match.capturedStart(); diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json index 7fb9ae64..6321765e 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json @@ -15,6 +15,7 @@ "metadata": { "capture_group": 1, "multi_keyword": true, + "search_target": "all", "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" } @@ -28,6 +29,7 @@ "metadata": { "capture_group": 1, "multi_keyword": false, + "search_target": "filename", "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", "split_pattern": 
"[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" } @@ -37,10 +39,25 @@ "pattern": "内容(?:包含|含有|带有)(.+?)(?:的|$)", "description": "Content contains keyword pattern", "enabled": true, - "priority": 200, + "priority": 210, + "metadata": { + "capture_group": 1, + "multi_keyword": true, + "search_target": "content", + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + }, + { + "id": "keyword_filename_has", + "pattern": "文件名(?:包含|含有|带有|是|为)(.+?)(?:的|$)", + "description": "Filename contains keyword pattern", + "enabled": true, + "priority": 210, "metadata": { "capture_group": 1, "multi_keyword": true, + "search_target": "filename", "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" } diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp index 0e817b10..1dc03b91 100644 --- a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -39,8 +39,15 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) SearchOptions baseOpts = buildBaseOptions(intent.timeConstraint, intent.sizeConstraint); baseOpts.setSearchMethod(SearchMethod::Indexed); - // --- File name search (always enabled) --- - { + // Determine which search paths to enable based on user intent + const bool enableFileName = (intent.searchTarget == SearchTarget::All + || intent.searchTarget == SearchTarget::FileNameOnly); + const bool enableContent = (intent.searchTarget == SearchTarget::All + || intent.searchTarget == SearchTarget::ContentOnly); + const bool enableOcr = enableContent; // OCR is a content search path + + // --- File name search --- + if (enableFileName) { SearchOptions opts = baseOpts; FileNameOptionsAPI fnameApi(opts); @@ -60,8 +67,8 @@ 
SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) plan.fileNameOptions = opts; } - // --- Content search (when keywords available) --- - { + // --- Content search (when keywords available and content target enabled) --- + if (enableContent) { const bool hasKeywords = !intent.keywords.isEmpty(); bool contentEnabled = hasKeywords; @@ -102,8 +109,8 @@ SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent) } } - // --- OCR search (when keywords available) --- - { + // --- OCR search (when keywords available and content target enabled) --- + if (enableOcr) { const bool hasKeywords = !intent.keywords.isEmpty(); bool ocrEnabled = hasKeywords; From 8a11f48777db85fa72ba9bfb1020c4a17271ce82 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 18 May 2026 13:55:04 +0800 Subject: [PATCH 18/36] feat: add chinese NLP parsing for relative time and size constraints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added support for suffix-only size constraints in Chinese (e.g. "10M 以上", "1G以下") 2. Implemented parsing for dynamic relative time expressions (e.g. "最近 3天", "近2小时") 3. Added unit tests for various combinations: size + filetype, time + filetype 4. Supported Chinese numerals in time and size expressions (e.g. "一百 兆", "近一周") 5. Added locale-aware number conversion for Chinese numerals (e.g. "一" to 1, "二" to 2) Log: Added Chinese NLP support for relative time and size constraint parsing Influence: 1. Test various size constraint patterns with Chinese characters 2. Verify combinations of size/date constraints with different file types 3. Check edge cases like maximum allowed values 4. Test Chinese numeral conversions in all contexts 5. Verify time calculations for relative time expressions feat: 添加中文自然语言处理对相对时间和大小约束的解析支持 1. 添加对中文后缀形式大小约束的支持(例如"10M以上", "1G以下") 2. 实现对动态相对时间表达式的解析(例如"最近3天", "近2小时") 3. 添加多种组合情况的单元测试:大小+文件类型,时间+文件类型 4. 支持时间和大小的中文数字表达式(例如"一百兆", "近一周") 5. 
添加针对中文数字的本地化数字转换(例如"一"转1,"二"转2) Log: 新增中文自然语言解析对相对时间和大小约束的支持 Influence: 1. 测试包含中文字符的各种大小约束模式 2. 验证大小/日期约束与不同文件类型的组合情况 3. 检查边界情况如最大允许值 4. 测试所有上下文中的中文数字转换 5. 验证相对时间表达式的时间计算 --- .../dfm-search-tests/tst_chinese_nlp.cpp | 266 ++++++++++++++++++ .../dfm-search-tests/tst_semantic_search.cpp | 2 + .../semantic/extractors/timeextractor.cpp | 78 +++++ .../semantic/extractors/timeextractor.h | 1 + .../semantic/rules/zh_CN/size_rules.json | 24 ++ .../semantic/rules/zh_CN/time_rules.json | 52 ++++ 6 files changed, 423 insertions(+) diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp index 0a63d245..65714652 100644 --- a/autotests/dfm-search-tests/tst_chinese_nlp.cpp +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -110,6 +110,10 @@ private Q_SLOTS: void size_combined_withTime(); void size_combined_withType(); void size_combined_full(); + void size_suffix_min(); + void size_suffix_max(); + void size_suffix_combined(); + void size_suffix_chineseUnits(); // Relative time tests void timeRelative_justNow(); @@ -122,6 +126,15 @@ private Q_SLOTS: void timeRelative_aWhileAgo_synonyms(); void timeRelative_priority_vs_preset(); + // Dynamic relative time tests + void timeDynamic_recent_days(); + void timeDynamic_recent_hours(); + void timeDynamic_recent_weeks(); + void timeDynamic_recent_months(); + void timeDynamic_combined_noKeyword(); + void timeDynamic_combined_withType(); + void timeDynamic_chineseNumerals(); + // Action behavior tests void action_create_birthTime(); void action_create_synonyms(); @@ -1208,6 +1221,80 @@ void tst_ChineseNLP::size_combined_full() QVERIFY(intent.fileExtensions.contains("mp4")); } +void tst_ChineseNLP::size_suffix_min() +{ + // Suffix-only min: "10M以上" without prefix keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("10M以上的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 10485760LL); // 10MB + 
QCOMPARE(intent.sizeConstraint.maxSize, 0LL); + + // "1G以上" — GB unit + ParsedIntent intent2; + m_parser->parse(QStringLiteral("1G以上的图片"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.minSize, 1073741824LL); // 1GB + + // "500K以上" — KB unit + ParsedIntent intent3; + m_parser->parse(QStringLiteral("500K以上的文档"), intent3); + QVERIFY(intent3.sizeConstraint.isValid()); + QCOMPARE(intent3.sizeConstraint.minSize, 512000LL); // 500KB +} + +void tst_ChineseNLP::size_suffix_max() +{ + // Suffix-only max: "10M以内" + ParsedIntent intent; + m_parser->parse(QStringLiteral("10M以内的文档"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 0LL); + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB + + // "1G以下" — "以下" variant + ParsedIntent intent2; + m_parser->parse(QStringLiteral("1G以下的视频"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.maxSize, 1073741824LL); // 1GB +} + +void tst_ChineseNLP::size_suffix_combined() +{ + // The originally reported bug: "10M以上的表格" + // Should parse both size constraint and filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("10M以上的表格"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 10485760LL); // 10MB + QVERIFY(intent.fileExtensions.contains("xls")); + QVERIFY(intent.fileExtensions.contains("xlsx")); + QVERIFY(intent.fileExtensions.contains("csv")); + + // "5G以内的压缩包" — size + filetype + ParsedIntent intent2; + m_parser->parse(QStringLiteral("5G以内的压缩包"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.maxSize, 5368709120LL); // 5GB + QVERIFY(intent2.fileExtensions.contains("zip")); + QVERIFY(intent2.fileExtensions.contains("rar")); +} + +void tst_ChineseNLP::size_suffix_chineseUnits() +{ + // Chinese unit names with suffix: "100兆以上" + ParsedIntent intent; + m_parser->parse(QStringLiteral("100兆以上的文件"), intent); 
+ QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + + // "50千以内" + ParsedIntent intent2; + m_parser->parse(QStringLiteral("50千以内的图片"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.maxSize, 51200LL); // 50KB +} + // ===== Relative Time Tests ===== void tst_ChineseNLP::timeRelative_justNow() @@ -1312,6 +1399,185 @@ void tst_ChineseNLP::timeRelative_priority_vs_preset() QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); } +// ===== Dynamic Relative Time Tests ===== + +void tst_ChineseNLP::timeDynamic_recent_days() +{ + // "最近3天" — dynamic relative, should consume all 4 chars + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3天"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Days); + // Verify time range: ~3 days ago to now + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime())); + QVERIFY2(startDelta >= 259000 && startDelta <= 259300, "Start should be ~3 days ago"); + + // "近3天" — shorter variant + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近3天的图片"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 3); + QCOMPARE(intent2.timeConstraint.relativeUnit, TimeUnit::Days); + QVERIFY(intent2.fileExtensions.contains("jpg")); + + // "过去7天" — variant + ParsedIntent intent3; + m_parser->parse(QStringLiteral("过去7天"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent3.timeConstraint.relativeValue, 7); + + // "前3天" — variant + ParsedIntent intent4; + m_parser->parse(QStringLiteral("前3天的文档"), intent4); + QCOMPARE(intent4.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent4.timeConstraint.relativeValue, 3); +} + +void 
tst_ChineseNLP::timeDynamic_recent_hours() +{ + // "最近2小时" — dynamic relative hours + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近2小时的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 2); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Hours); + + // "近1小时" — shorter variant + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近1小时"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 1); +} + +void tst_ChineseNLP::timeDynamic_recent_weeks() +{ + // "最近2周" — dynamic relative weeks + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近2周的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 2); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Weeks); + // ~14 days + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime())); + QVERIFY2(startDelta >= 1209000 && startDelta <= 1210000, "Start should be ~2 weeks ago"); +} + +void tst_ChineseNLP::timeDynamic_recent_months() +{ + // "最近3个月" — dynamic relative months + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3个月的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Months); + QVERIFY(intent.fileExtensions.contains("jpg")); + + // "近1月" — shorter variant without "个" + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近1月的文档"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 1); +} + +void tst_ChineseNLP::timeDynamic_combined_noKeyword() +{ + // The originally reported bug: "最近3天的表格" + // Should parse as: time(recent 3 days) + filetype(spreadsheet) — NO keyword + 
ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3天的表格"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Days); + // Filetype should be matched + QVERIFY(intent.fileExtensions.contains("xls")); + QVERIFY(intent.fileExtensions.contains("xlsx")); + QVERIFY(intent.fileExtensions.contains("csv")); + // No keywords — "3天" is consumed as part of the time expression + QVERIFY2(intent.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent.keywords.join(","))); + + // "过去7天的文档" — same pattern + ParsedIntent intent2; + m_parser->parse(QStringLiteral("过去7天的文档"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 7); + QVERIFY(!intent2.fileExtensions.isEmpty()); + QVERIFY2(intent2.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent2.keywords.join(","))); +} + +void tst_ChineseNLP::timeDynamic_combined_withType() +{ + // "最近3天的图片和视频" — time + multiple filetypes, no keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3天的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY2(intent.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent.keywords.join(","))); + + // "近2个月的压缩包" — time + filetype + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近2个月的压缩包"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 2); + QCOMPARE(intent2.timeConstraint.relativeUnit, TimeUnit::Months); + QVERIFY(intent2.fileExtensions.contains("zip")); + QVERIFY2(intent2.keywords.isEmpty(), + 
qPrintable(QStringLiteral("Expected no keywords, got: ") + intent2.keywords.join(","))); +} + +void tst_ChineseNLP::timeDynamic_chineseNumerals() +{ + // "最近一周的图片" — 一 = 1 + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近一周的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 1); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Weeks); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.keywords.isEmpty()); + + // "最近两周的表格" — 两 = 2 + ParsedIntent intent2; + m_parser->parse(QStringLiteral("最近两周的表格"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 2); + QCOMPARE(intent2.timeConstraint.relativeUnit, TimeUnit::Weeks); + QVERIFY(intent2.fileExtensions.contains("xls")); + QVERIFY(intent2.keywords.isEmpty()); + + // "最近三天的文档" — 三 = 3 + ParsedIntent intent3; + m_parser->parse(QStringLiteral("最近三天的文档"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent3.timeConstraint.relativeValue, 3); + QCOMPARE(intent3.timeConstraint.relativeUnit, TimeUnit::Days); + QVERIFY(!intent3.fileExtensions.isEmpty()); + QVERIFY(intent3.keywords.isEmpty()); + + // "近五个月的视频" — 五 = 5 + ParsedIntent intent4; + m_parser->parse(QStringLiteral("近五个月的视频"), intent4); + QCOMPARE(intent4.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent4.timeConstraint.relativeValue, 5); + QCOMPARE(intent4.timeConstraint.relativeUnit, TimeUnit::Months); + QVERIFY(intent4.fileExtensions.contains("mp4")); + QVERIFY(intent4.keywords.isEmpty()); + + // "过去七天" — 七 = 7 + ParsedIntent intent5; + m_parser->parse(QStringLiteral("过去七天"), intent5); + QCOMPARE(intent5.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent5.timeConstraint.relativeValue, 7); + + // Mixed: Arabic + Chinese should still work + // "最近3天" already tested above +} + // ===== Action 
Behavior Tests ===== void tst_ChineseNLP::action_create_birthTime() diff --git a/autotests/dfm-search-tests/tst_semantic_search.cpp b/autotests/dfm-search-tests/tst_semantic_search.cpp index 86d6cf42..f56577cc 100644 --- a/autotests/dfm-search-tests/tst_semantic_search.cpp +++ b/autotests/dfm-search-tests/tst_semantic_search.cpp @@ -853,6 +853,8 @@ void tst_IsSemanticQuery::sizeDynamic() { QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "大于500M的文件")); QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "小于100K")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "10M以上的表格")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "1G以内的文档")); } void tst_IsSemanticQuery::timeAndFileType() diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp index 30501f58..7a795cad 100644 --- a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp @@ -57,6 +57,8 @@ void TimeExtractor::extract(const QString &input, ParsedIntent &intent) parseCustomTime(match, metadata, tc); } else if (typeStr == "relative") { parseRelativeTime(metadata, tc); + } else if (typeStr == "relative_dynamic") { + parseDynamicRelativeTime(match, metadata, tc); } if (tc.isValid()) { @@ -167,6 +169,82 @@ void TimeExtractor::parseRelativeTime(const QVariantMap &metadata, TimeConstrain } } +void TimeExtractor::parseDynamicRelativeTime(const QRegularExpressionMatch &match, + const QVariantMap &metadata, + TimeConstraint &tc) +{ + // Load locale-aware number conversion from rule metadata + const QMap digitMap = mapFromVariant(metadata.value("digit_map")); + const QString tensUnit = metadata.value("tens_unit").toString(); + + // Extract numeric value from any of the named capture groups (value, value2, value3, value4) + int value = 0; + const QStringList captureNames = { QStringLiteral("value"), QStringLiteral("value2"), + 
QStringLiteral("value3"), QStringLiteral("value4") }; + for (const QString &name : captureNames) { + const QString captured = match.captured(name); + if (!captured.isNull()) { + value = localeAwareToInt(captured, digitMap, tensUnit); + if (value > 0) { + break; + } + value = 0; + } + } + + if (value <= 0 || value > 3650) { + return; + } + + // Unit comes only from the rule's "default_unit" metadata; any unrecognised value falls back to days + const QString unitStr = metadata.value("default_unit").toString(); + TimeUnit unit = TimeUnit::Days; + if (unitStr == "hours") { + unit = TimeUnit::Hours; + } else if (unitStr == "weeks") { + unit = TimeUnit::Weeks; + } else if (unitStr == "months") { + unit = TimeUnit::Months; + } + + const QDateTime now = QDateTime::currentDateTime(); + qint64 totalSeconds = 0; + + switch (unit) { + case TimeUnit::Minutes: // Minutes and Years are never selected by the default_unit mapping above + totalSeconds = static_cast(value) * 60; + break; + case TimeUnit::Hours: + totalSeconds = static_cast(value) * 3600; + break; + case TimeUnit::Days: + totalSeconds = static_cast(value) * 86400; + break; + case TimeUnit::Weeks: + totalSeconds = static_cast(value) * 7 * 86400; + break; + case TimeUnit::Months: { + // Calendar months via addMonths(); the range starts at 00:00 of that day, unlike the second-precise units + const QDate startDate = now.addMonths(-value).date(); + tc.kind = TimeConstraintKind::Relative; + tc.customStart = QDateTime(startDate, QTime(0, 0, 0)); + tc.customEnd = now; + tc.relativeValue = value; + tc.relativeUnit = unit; + return; + } + case TimeUnit::Years: + totalSeconds = static_cast(value) * 365 * 86400; + break; + } + + tc.kind = TimeConstraintKind::Relative; + tc.customStart = now.addSecs(-totalSeconds); + tc.customEnd = now; + tc.relativeValue = value; + tc.relativeUnit = unit; +} + int TimeExtractor::localeAwareToInt(const QString &input, const QMap &digitMap, const QString &tensUnit) diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h index 66774f6b..c99095bf 100644 --- 
a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h @@ -25,6 +25,7 @@ class TimeExtractor : public DimensionExtractor private: void parseCustomTime(const QRegularExpressionMatch &match, const QVariantMap &metadata, TimeConstraint &tc); void parseRelativeTime(const QVariantMap &metadata, TimeConstraint &tc); + void parseDynamicRelativeTime(const QRegularExpressionMatch &match, const QVariantMap &metadata, TimeConstraint &tc); /** * @brief Convert a string to int using locale-aware digit mapping. diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json index 37e4a4bf..21269c15 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json @@ -31,6 +31,30 @@ "include_upper": false } }, + { + "id": "size_dynamic_min_suffix", + "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)\\s*以上", + "description": "Dynamic size min with suffix only (e.g. 10M以上, 1G以上, 500K以上)", + "enabled": true, + "priority": 160, + "metadata": { + "type": "dynamic", + "direction": "min", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + }, + { + "id": "size_dynamic_max_suffix", + "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)\\s*(?:以内|以下)", + "description": "Dynamic size max with suffix only (e.g. 
10M以内, 1G以下, 500K以内)", + "enabled": true, + "priority": 160, + "metadata": { + "type": "dynamic", + "direction": "max", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + }, { "id": "size_dynamic", "pattern": "(大于|超过|最少|至少|>)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?以?[上内]?", diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json index bd125279..20f116eb 100644 --- a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json @@ -256,6 +256,58 @@ "ago_end_seconds": 0 } }, + { + "id": "time_recent_dynamic_days", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天", + "description": "Recent N days (e.g. 最近3天, 最近三天, 近3天, 过去七天)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "days", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_recent_dynamic_hours", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时", + "description": "Recent N hours (e.g. 最近2小时, 最近两小时, 近一小时)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "hours", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_recent_dynamic_weeks", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周", + "description": "Recent N weeks (e.g. 
最近2周, 最近两周, 近一周)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "weeks", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_recent_dynamic_months", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月", + "description": "Recent N months (e.g. 最近3个月, 最近三个月, 近一月)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "months", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, { "id": "time_past_few_days", "pattern": "那些天|之前几天|前几天", From 73050bea3a6e225ac3c1902428351e9ec5f1cf77 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 18 May 2026 16:06:15 +0800 Subject: [PATCH 19/36] feat: add NGram analyzer and tokenizer for Lucene++ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Implemented NGramAnalyzer class for Lucene++ that generates overlapping word n-grams 2. Implemented NGramTokenizer class that performs the actual n-gram generation 3. Added support for configurable min/max n-gram sizes (default 2-4) 4. Includes buffer management for efficient text processing 5. Implements position handling and attribute management for search integration Log: Added NGram analyzer and tokenizer for enhanced text searching Influence: 1. Test search functionality with different min/max gram sizes 2. Verify correct token generation for various input lengths 3. Check buffer handling with large input texts 4. Test position increment behavior in search results 5. Verify case insensitivity in token generation 6. Test edge cases with very short/long input strings feat: 为Lucene++添加NGram分析器和分词器 1. 
实现了针对Lucene++的NGramAnalyzer类,用于生成重叠的词n-gram 2. 实现了NGramTokenizer类,执行实际的n-gram生成 3. 添加了可配置的最小/最大n-gram大小支持(默认为2-4) 4. 包含用于高效文本处理的缓冲区管理 5. 实现了搜索集成的位置处理和属性管理 Log: 新增NGram分析器和分词器以增强文本搜索功能 Influence: 1. 使用不同最小/最大gram大小测试搜索功能 2. 验证各种输入长度下正确的token生成 3. 检查大数据量文本下的缓冲区处理 4. 测试搜索结果中的位置增量行为 5. 验证token生成中的大小写不敏感处理 6. 测试极短/极长输入字符串的边缘情况 --- .../dfm-search/lucene++/ngramanalyzer.h | 31 ++++ .../dfm-search/lucene++/ngramtokenizer.h | 55 +++++++ .../dfm-search-lib/lucene++/ngramanalyzer.cpp | 37 +++++ .../lucene++/ngramtokenizer.cpp | 144 ++++++++++++++++++ 4 files changed, 267 insertions(+) create mode 100644 include/dfm-search/dfm-search/lucene++/ngramanalyzer.h create mode 100644 include/dfm-search/dfm-search/lucene++/ngramtokenizer.h create mode 100644 src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp create mode 100644 src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp diff --git a/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h b/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h new file mode 100644 index 00000000..2e4646d5 --- /dev/null +++ b/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NGRAMANALYZER_H +#define NGRAMANALYZER_H + +#include + +namespace Lucene { + +class NGramAnalyzer : public Analyzer +{ +public: + explicit NGramAnalyzer(int32_t minGram = 2, int32_t maxGram = 4); + virtual ~NGramAnalyzer(); + + LUCENE_CLASS(NGramAnalyzer); + +public: + virtual TokenStreamPtr tokenStream(const String& fieldName, const ReaderPtr& reader); + virtual TokenStreamPtr reusableTokenStream(const String& fieldName, const ReaderPtr& reader); + +private: + int32_t m_minGram; + int32_t m_maxGram; +}; + +} // namespace Lucene + +#endif // NGRAMANALYZER_H \ No newline at end of file diff --git a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h new file mode 100644 index 00000000..19f8cd8b --- /dev/null +++ b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NGRAMTOKENIZER_H +#define NGRAMTOKENIZER_H + +#include + +namespace Lucene { + +class NGramTokenizer : public Tokenizer +{ +public: + NGramTokenizer(const ReaderPtr& input, int32_t minGram, int32_t maxGram); + NGramTokenizer(const AttributeFactoryPtr& factory, const ReaderPtr& input, + int32_t minGram, int32_t maxGram); + + virtual ~NGramTokenizer(); + + LUCENE_CLASS(NGramTokenizer); + +public: + virtual bool incrementToken(); + virtual void end(); + virtual void reset(); + +private: + void init(); + bool fillBuffer(int32_t need); + + int32_t m_minGram; + int32_t m_maxGram; + + static const int32_t kMaxWordLen = 256; + static const int32_t kIoBufferSize = 1024; + + CharArray m_ioBuffer; + int32_t m_ioLen; + int32_t m_bufferIndex; + bool m_inputExhausted; + + int32_t m_offset; // current position in the logical input stream + int32_t m_gramSize; // current n-gram size being emitted + CharArray m_termBuffer; + + TermAttributePtr 
m_termAtt; + OffsetAttributePtr m_offsetAtt; + PositionIncrementAttributePtr m_posIncrAtt; + bool m_isFirstTokenAtPosition; // true = positionIncrement=1, false = 0 (same position) +}; + +} // namespace Lucene + +#endif // NGRAMTOKENIZER_H \ No newline at end of file diff --git a/src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp b/src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp new file mode 100644 index 00000000..e21f9fe3 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include + +namespace Lucene { + +NGramAnalyzer::NGramAnalyzer(int32_t minGram, int32_t maxGram) + : m_minGram(minGram), m_maxGram(maxGram) +{ +} + +NGramAnalyzer::~NGramAnalyzer() +{ +} + +TokenStreamPtr NGramAnalyzer::tokenStream(const String &fieldName, const ReaderPtr &reader) +{ + return newLucene(reader, m_minGram, m_maxGram); +} + +TokenStreamPtr NGramAnalyzer::reusableTokenStream(const String &fieldName, const ReaderPtr &reader) +{ + LuceneObjectPtr prev = getPreviousTokenStream(); + TokenizerPtr saved(boost::dynamic_pointer_cast(prev)); + if (!saved) { + saved = newLucene(reader, m_minGram, m_maxGram); + setPreviousTokenStream(saved); + } else { + saved->reset(reader); + } + return saved; +} + +} // namespace Lucene diff --git a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp new file mode 100644 index 00000000..4d663322 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include + +#include +#include +#include +#include +#include +#include + +namespace Lucene { + +const int32_t NGramTokenizer::kMaxWordLen; +const int32_t NGramTokenizer::kIoBufferSize; + +NGramTokenizer::NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram) + : Tokenizer(input), m_minGram(minGram), m_maxGram(maxGram), m_isFirstTokenAtPosition(true) +{ + init(); +} + +NGramTokenizer::NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, + int32_t minGram, int32_t maxGram) + : Tokenizer(factory, input), m_minGram(minGram), m_maxGram(maxGram), m_isFirstTokenAtPosition(true) +{ + init(); +} + +NGramTokenizer::~NGramTokenizer() +{ +} + +void NGramTokenizer::init() +{ + m_ioBuffer = CharArray::newInstance(kIoBufferSize); + memset(m_ioBuffer.get(), 0, kIoBufferSize); + m_termBuffer = CharArray::newInstance(m_maxGram); + memset(m_termBuffer.get(), 0, m_maxGram); + + m_termAtt = addAttribute(); + m_offsetAtt = addAttribute(); + m_posIncrAtt = addAttribute(); +} + +void NGramTokenizer::reset() +{ + Tokenizer::reset(); + m_bufferIndex = 0; + m_ioLen = 0; + m_inputExhausted = false; + m_offset = 0; + m_gramSize = m_minGram; + m_isFirstTokenAtPosition = true; +} + +// Ensure at least 'need' chars are available starting from m_bufferIndex +bool NGramTokenizer::fillBuffer(int32_t need) +{ + if (need <= 0) + return true; + + int32_t available = m_ioLen - m_bufferIndex; + if (available >= need) + return true; + + if (m_inputExhausted) + return false; + + // Compact: shift unread data to front of buffer + if (m_bufferIndex > 0) { + int32_t remaining = m_ioLen - m_bufferIndex; + if (remaining > 0) + memmove(m_ioBuffer.get(), m_ioBuffer.get() + m_bufferIndex, remaining * sizeof(wchar_t)); + m_ioLen = remaining; + m_bufferIndex = 0; + } + + // Read more from input + while (m_ioLen < kIoBufferSize) { + int32_t read = input->read(m_ioBuffer.get(), m_ioLen, kIoBufferSize - m_ioLen); + if (read == 
-1) { + m_inputExhausted = true; + break; + } + m_ioLen += read; + if (m_ioLen - m_bufferIndex >= need) + return true; + } + + return (m_ioLen - m_bufferIndex) >= need; +} + +bool NGramTokenizer::incrementToken() +{ + clearAttributes(); + + while (true) { + // Need m_gramSize chars starting from current buffer position + if (!fillBuffer(m_gramSize)) { + // Not enough chars left — advance to next offset, reset gram size + m_bufferIndex++; + m_offset++; + m_gramSize = m_minGram; + m_isFirstTokenAtPosition = true; + if (!fillBuffer(m_minGram)) + return false; + continue; + } + + int32_t start = m_offset; + + // Emit the n-gram at current buffer position with current gram size + for (int32_t i = 0; i < m_gramSize; ++i) { + m_termBuffer[i] = CharFolder::toLower(m_ioBuffer[m_bufferIndex + i]); + } + m_termAtt->setTermBuffer(m_termBuffer.get(), 0, m_gramSize); + m_offsetAtt->setOffset(correctOffset(start), correctOffset(start + m_gramSize)); + m_posIncrAtt->setPositionIncrement(m_isFirstTokenAtPosition ? 1 : 0); + + // Cycle gram size: 2 → 3 → 4, then advance offset + m_gramSize++; + if (m_gramSize > m_maxGram) { + m_gramSize = m_minGram; + m_bufferIndex++; + m_offset++; + m_isFirstTokenAtPosition = true; + } else { + m_isFirstTokenAtPosition = false; + } + + return true; + } +} + +void NGramTokenizer::end() +{ + int32_t finalOffset = correctOffset(m_offset); + m_offsetAtt->setOffset(finalOffset, finalOffset); +} + +} // namespace Lucene From bd89e8c4940ddc195884fb58f69855ee686275a7 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 18 May 2026 16:52:44 +0800 Subject: [PATCH 20/36] fix: improve content search engine validation and analyzer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Changed keyword length validation from UTF-8 byte count to character count in ContentSearchEngine 2. Replaced ChineseAnalyzer with NGramAnalyzer(2,2) in IndexedStrategy for better search performance 3. 
Removed unused highlight function and related ChineseAnalyzer dependency 4. Cleaned up unnecessary headers and code The changes improve search accuracy by validating keyword length based on characters rather than bytes, and enhance search performance by using NGramAnalyzer instead of ChineseAnalyzer. The removed highlight functionality was unused and potentially problematic. Influence: 1. Test search with short keywords to verify proper validation 2. Verify search accuracy with different keyword lengths 3. Test performance with various search queries 4. Ensure search results are still properly highlighted when applicable fix: 改进内容搜索引擎验证和分析器 1. 在 ContentSearchEngine 中优化关键词长度验证方式,从 UTF-8 字节数改为 字符数 2. 将 IndexedStrategy 中的 ChineseAnalyzer 替换为 NGramAnalyzer(2,2),提 升搜索性能 3. 移除未使用的 highlight 功能及相关 ChineseAnalyzer 依赖 4. 清理不必要的头文件和代码 这些改进通过基于字符而非字节的关键词长度验证提高了搜索准确性,并且通过 使用 NGramAnalyzer 替代 ChineseAnalyzer 提升了搜索性能。移除的 highlight 功能未被使用且可能存在隐患。 Influence: 1. 测试短关键词搜索验证正确性 2. 验证不同长度关键词的搜索准确度 3. 测试各种搜索查询的性能表现 4. 
确保在适用情况下搜索结果仍能正确高亮显示 --- .../dfm-search/lucene++/ngramanalyzer.h | 10 +-- .../dfm-search/lucene++/ngramtokenizer.h | 3 +- .../contentsearch/contentsearchengine.cpp | 2 +- .../contentstrategies/indexedstrategy.cpp | 4 +- .../lucene++/ngramtokenizer.cpp | 13 +++- .../utils/contenthighlighter.cpp | 67 ------------------- .../dfm-search-lib/utils/contenthighlighter.h | 10 --- 7 files changed, 20 insertions(+), 89 deletions(-) diff --git a/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h b/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h index 2e4646d5..08ca8436 100644 --- a/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h +++ b/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h @@ -12,20 +12,20 @@ namespace Lucene { class NGramAnalyzer : public Analyzer { public: - explicit NGramAnalyzer(int32_t minGram = 2, int32_t maxGram = 4); + explicit NGramAnalyzer(int32_t minGram, int32_t maxGram); virtual ~NGramAnalyzer(); LUCENE_CLASS(NGramAnalyzer); public: - virtual TokenStreamPtr tokenStream(const String& fieldName, const ReaderPtr& reader); - virtual TokenStreamPtr reusableTokenStream(const String& fieldName, const ReaderPtr& reader); + virtual TokenStreamPtr tokenStream(const String &fieldName, const ReaderPtr &reader); + virtual TokenStreamPtr reusableTokenStream(const String &fieldName, const ReaderPtr &reader); private: int32_t m_minGram; int32_t m_maxGram; }; -} // namespace Lucene +} // namespace Lucene -#endif // NGRAMANALYZER_H \ No newline at end of file +#endif // NGRAMANALYZER_H diff --git a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h index 19f8cd8b..fa9b57a7 100644 --- a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h +++ b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h @@ -23,6 +23,8 @@ class NGramTokenizer : public Tokenizer public: virtual bool incrementToken(); virtual void end(); + + using Tokenizer::reset; virtual void reset(); private: @@ -32,7 
+34,6 @@ class NGramTokenizer : public Tokenizer int32_t m_minGram; int32_t m_maxGram; - static const int32_t kMaxWordLen = 256; static const int32_t kIoBufferSize = 1024; CharArray m_ioBuffer; diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp index bca9df31..d992598e 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp @@ -42,7 +42,7 @@ SearchError ContentSearchEngine::validateSearchConditions() } if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength + && m_currentQuery.keyword().size() < Global::kMinContentSearchKeywordLength && api.filenameKeyword().isEmpty()) { return SearchError(ContentSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index f3fc4200..d6df0e11 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -18,8 +18,8 @@ #include #include +#include -#include "3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" #include "utils/lucenequeryutils.h" @@ -496,7 +496,7 @@ void ContentIndexedStrategy::performContentSearch(const SearchQuery &query) IndexSearcherPtr searcher = newLucene(reader); // 创建分析器 - AnalyzerPtr analyzer = newLucene(); + AnalyzerPtr analyzer = newLucene(2, 2); // 构建查询 m_currentQuery = buildLuceneQuery(query, analyzer, m_options.searchPath()); diff --git a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp index 4d663322..b48f70ae 100644 
--- a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp +++ b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp @@ -4,6 +4,8 @@ #include +#include + #include #include #include @@ -13,18 +15,23 @@ namespace Lucene { -const int32_t NGramTokenizer::kMaxWordLen; const int32_t NGramTokenizer::kIoBufferSize; NGramTokenizer::NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram) - : Tokenizer(input), m_minGram(minGram), m_maxGram(maxGram), m_isFirstTokenAtPosition(true) + : Tokenizer(input), + m_minGram(std::min(minGram, maxGram)), + m_maxGram(std::max(minGram, maxGram)), + m_isFirstTokenAtPosition(true) { init(); } NGramTokenizer::NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, int32_t minGram, int32_t maxGram) - : Tokenizer(factory, input), m_minGram(minGram), m_maxGram(maxGram), m_isFirstTokenAtPosition(true) + : Tokenizer(factory, input), + m_minGram(std::min(minGram, maxGram)), + m_maxGram(std::max(minGram, maxGram)), + m_isFirstTokenAtPosition(true) { init(); } diff --git a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp index 889e44f5..cc95dd14 100644 --- a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp +++ b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp @@ -14,35 +14,12 @@ #include #include -#include "chineseanalyzer.h" - using namespace Lucene; DFM_SEARCH_BEGIN_NS namespace ContentHighlighter { -namespace { -QString mergeAdjacentHighlightTags(const QString &text) -{ - // 使用正则表达式搜索和替换相邻的高亮标签 - QString result = text; - - // 替换模式: 将被删除,从而合并相邻的标签 - static const QString pattern = QLatin1String(""); - static const QString replacement = QLatin1String(""); - - // 循环替换直到不再有变化(处理连续多个标签的情况) - QString previousResult; - do { - previousResult = result; - result = result.replace(pattern, replacement); - } while (result != previousResult); - - return result; -} -} // namespace - namespace { struct KeywordMatch @@ -265,50 
+242,6 @@ QString customHighlight(const QStringList &keywords, const QString &content, int return resultSnippet; } -QString highlight(const QString &content, const Lucene::QueryPtr &query, int maxLength, bool enableHtml) -{ - try { - if (content.isEmpty()) { - return {}; - } - - // 尝试使用Lucene高亮器 - FormatterPtr formatter; - if (enableHtml) { - formatter = newLucene(L"", L""); - } else { - formatter = newLucene(L"", L""); - } - HighlighterScorerPtr scorer = newLucene(query); - HighlighterPtr highlighter = newLucene(formatter, scorer); - - // 创建分析器 - AnalyzerPtr analyzer = newLucene(); - - TokenStreamPtr tokenStream = analyzer->tokenStream(L"contents", newLucene(content.toStdWString())); - Collection fragments = highlighter->getBestFragments(tokenStream, content.toStdWString(), 1); - - QString result; - if (!fragments.empty() && !fragments[0].empty()) { - // Lucene高亮成功,使用其结果 - result = QString::fromStdWString(fragments[0]); - } else { - // TODO: Lucene高亮失败,使用自定义高亮方法 - // result = customHighlight(content, query, contentLength); - } - - // 处理连续的高亮标签 - if (enableHtml) { - result = mergeAdjacentHighlightTags(result); - } - - return result.simplified(); - } catch (const LuceneException &e) { - qWarning() << "Highlighting failed:" << QString::fromStdWString(e.getError()); - return QStringLiteral("(Error highlighting content)"); - } -} - } // namespace ContentHighlighter DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h index e35a5241..e926fd66 100644 --- a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h +++ b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h @@ -39,16 +39,6 @@ namespace ContentHighlighter { */ QString customHighlight(const QStringList &keywords, const QString &content, int maxLength, bool enableHtml); -/** - * @brief 高亮搜索结果中的关键词 - * @param content 要高亮的内容 - * @param query Lucene查询对象 - * @param maxLength 最大显示长度 - * @param enableHtml 
是否启用HTML标签高亮,默认为false - * @return 高亮后的内容 - */ -QString highlight(const QString &content, const Lucene::QueryPtr &query, int maxLength, bool enableHtml); - } // namespace ContentHighlighter DFM_SEARCH_END_NS From b7d77d9e6f09dd4f1bfeddc8c73fea86aba477ce Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Tue, 19 May 2026 10:14:23 +0800 Subject: [PATCH 21/36] refactor: optimize search filtering and query building MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Move path filtering, exclusion and hidden file checks to Lucene query layer for better performance 2. Remove obsolete SearchUtility ancestor paths support checks since all indexes now support this feature 3. Pre-allocate result vectors to avoid reallocations during append 4. Use move semantics for SearchResult objects where applicable 5. Remove unused searchPath parameter from query building methods 6. Consolidate path and permission checks into single filtering step 7. Remove unused SearchUtility headers and functionality Log: Optimized search performance by moving filtering to query layer Influence: 1. Test content search with various path filters and exclusions 2. Verify hidden file filtering works correctly 3. Test OCR text search performance with large results 4. Verify filename search maintains all previous functionality 5. Check all types of searches with verbose mode enabled 6. Test with multiple search paths and complex exclusion paths refactor: 优化搜索过滤和查询构建逻辑 1. 将路径过滤、排除和隐藏文件检查移至 Lucene 查询层以提高性能 2. 移除过时的 SearchUtility 祖先路径支持检查,所有索引现在均支持此功能 3. 预分配结果向量以避免追加时的重新分配 4. 适用处使用移动语义处理 SearchResult 对象 5. 从查询构建方法中移除未使用的 searchPath 参数 6. 将路径和权限检查整合为单个过滤步骤 7. 移除未使用的 SearchUtility 头文件和相关功能 Log: 通过将过滤移至查询层优化了搜索性能 Influence: 1. 测试带有各种路径过滤和排除的内容搜索 2. 验证隐藏文件过滤功能正常工作 3. 测试包含大量结果的OCR文本搜索性能 4. 验证文件名搜索保持所有原有功能 5. 测试启用详细模式的所有类型搜索 6. 
测试包含多个搜索路径和复杂排除路径的情况 --- src/dfm-search/dfm-search-client/main.cpp | 8 +- .../contentstrategies/indexedstrategy.cpp | 169 ++++++++---------- .../contentstrategies/indexedstrategy.h | 2 +- .../filenamestrategies/indexedstrategy.cpp | 49 +++-- .../filenamestrategies/indexedstrategy.h | 4 +- .../ocrtextstrategies/indexedstrategy.cpp | 166 ++++++++--------- .../ocrtextstrategies/indexedstrategy.h | 2 +- .../dfm-search-lib/utils/searchutility.cpp | 22 --- .../dfm-search-lib/utils/searchutility.h | 21 --- 9 files changed, 179 insertions(+), 264 deletions(-) diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index f5335c43..f6d7d4f9 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -51,8 +51,8 @@ static void configureSearchOptions(SearchOptions &options, const SearchCliConfig } else if (config.searchType == SearchType::Content) { ContentOptionsAPI contentOptions(options); contentOptions.setMaxPreviewLength(config.maxPreviewLength); - contentOptions.setFullTextRetrievalEnabled(true); - contentOptions.setSearchResultHighlightEnabled(true); + contentOptions.setFullTextRetrievalEnabled(config.verbose); + contentOptions.setSearchResultHighlightEnabled(config.verbose); contentOptions.setFilenameContentMixedAndSearchEnabled(true); if (!config.filenameKeyword.isEmpty()) { contentOptions.setFilenameKeyword(config.filenameKeyword); @@ -60,8 +60,8 @@ static void configureSearchOptions(SearchOptions &options, const SearchCliConfig } else if (config.searchType == SearchType::Ocr) { OcrTextOptionsAPI ocrTextOptions(options); ocrTextOptions.setMaxPreviewLength(config.maxPreviewLength); - ocrTextOptions.setFullTextRetrievalEnabled(true); - ocrTextOptions.setSearchResultHighlightEnabled(true); + ocrTextOptions.setFullTextRetrievalEnabled(config.verbose); + ocrTextOptions.setSearchResultHighlightEnabled(config.verbose); ocrTextOptions.setFilenameOcrContentMixedAndSearchEnabled(true); if 
(!config.filenameKeyword.isEmpty()) { ocrTextOptions.setFilenameKeyword(config.filenameKeyword); diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index d6df0e11..1d6c088f 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -23,7 +23,6 @@ #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" #include "utils/lucenequeryutils.h" -#include "utils/searchutility.h" #include "utils/lucene_cancellation_compat.h" #include "utils/timerangeutils.h" @@ -64,7 +63,7 @@ void ContentIndexedStrategy::search(const SearchQuery &query) } } -Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath) +Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer) { try { m_keywords.clear(); @@ -100,9 +99,41 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que mainQuery = newLucene(); // Should not happen } + // Add filename keyword query (before filters, so it replaces empty content query correctly) + QString filenameKw = optAPI.filenameKeyword(); + if (!filenameKw.isEmpty()) { + Lucene::QueryParserPtr filenameParser = newLucene( + Lucene::LuceneVersion::LUCENE_CURRENT, + LuceneFieldNames::Content::kFilename, + analyzer); + Lucene::QueryPtr filenameQuery = filenameParser->parse( + LuceneQueryUtils::processQueryString(filenameKw, false)); + + if (filenameQuery) { + // Check if content keywords are effectively empty + bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) + ? 
query.keyword().isEmpty() + : (query.subQueries().isEmpty() + || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), + [](const auto &sq) { return sq.keyword().isEmpty(); })); + + if (noContentKeywords) { + // Filename-only search: replace empty content query with filename query + mainQuery = filenameQuery; + } else if (mainQuery) { + // Both content and filename: AND combination + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(filenameQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } + m_keywords.append(filenameKw); + } + } + // Add path prefix query optimization QStringList searchPathsList = m_options.searchPaths(); - if (mainQuery && SearchUtility::isContentIndexAncestorPathsSupported()) { + if (mainQuery) { QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( searchPathsList, QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); @@ -115,6 +146,34 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add excluded paths filter (pushed down to query layer to avoid per-doc filtering) + if (mainQuery) { + const QStringList &excludedPaths = m_options.searchExcludedPaths(); + if (!excludedPaths.isEmpty()) { + QueryPtr excludedQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + excludedPaths, + QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); + if (excludedQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(excludedQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + } + } + + // Add hidden file filter (pushed down to query layer) + if (mainQuery && !m_options.includeHidden()) { + QueryPtr hiddenQuery = Lucene::newLucene( + Lucene::newLucene( + LuceneFieldNames::Content::kIsHidden, + L"Y")); + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + 
finalQuery->add(hiddenQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); @@ -157,38 +216,6 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } - // Add filename keyword query - QString filenameKw = optAPI.filenameKeyword(); - if (!filenameKw.isEmpty()) { - Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::Content::kFilename, - analyzer); - Lucene::QueryPtr filenameQuery = filenameParser->parse( - LuceneQueryUtils::processQueryString(filenameKw, false)); - - if (filenameQuery) { - // Check if content keywords are effectively empty - bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) - ? query.keyword().isEmpty() - : (query.subQueries().isEmpty() - || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), - [](const auto &sq) { return sq.keyword().isEmpty(); })); - - if (noContentKeywords) { - // Filename-only search: use filename query directly - mainQuery = filenameQuery; - } else if (mainQuery) { - // Both content and filename: AND combination - BooleanQueryPtr finalQuery = newLucene(); - finalQuery->add(mainQuery, BooleanClause::MUST); - finalQuery->add(filenameQuery, BooleanClause::MUST); - mainQuery = finalQuery; - } - m_keywords.append(filenameKw); - } - } - return mainQuery; } catch (const Lucene::LuceneException &e) { @@ -292,9 +319,6 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QElapsedTimer resultTimer; resultTimer.start(); - QString searchPath = m_options.searchPath(); - const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); - QStringList allSearchPaths = m_options.searchPaths(); auto docsSize = scoreDocs.size(); ContentOptionsAPI optAPI(m_options); @@ -302,6 +326,9 @@ void ContentIndexedStrategy::processSearchResults(const 
Lucene::IndexSearcherPtr int previewLen = optAPI.maxPreviewLength() > 0 ? optAPI.maxPreviewLength() : 50; bool enableRetrieval = optAPI.isFullTextRetrievalEnabled(); + // Pre-allocate to avoid reallocation during append + m_results.reserve(m_results.size() + static_cast(docsSize)); + for (int32_t i = 0; i < docsSize; ++i) { if (m_cancelled.load()) { qInfo() << "Content search cancelled"; @@ -310,18 +337,11 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr try { Lucene::ScoreDocPtr scoreDoc = scoreDocs[i]; - if (!scoreDoc) { - qWarning() << "Null ScoreDoc encountered at index" << i; - continue; - } - - // Defensive check: verify document ID is valid - if (scoreDoc->doc < 0) { - qWarning() << "Invalid document ID:" << scoreDoc->doc; + if (!scoreDoc || scoreDoc->doc < 0) { + qWarning() << "Invalid ScoreDoc at index" << i; continue; } - // Safely retrieve document (could throw if index is corrupted) Lucene::DocumentPtr doc; try { doc = searcher->doc(scoreDoc->doc); @@ -337,55 +357,18 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr continue; } - // Safely get path - Lucene::String pathField; - try { - pathField = doc->get(LuceneFieldNames::Content::kPath); - if (pathField.empty()) { - qWarning() << "Document missing path field at index:" << scoreDoc->doc; - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving path field:" << e.what(); - continue; - } - - QString path = QString::fromStdWString(pathField); - - // Check against all search paths - if (!std::any_of(allSearchPaths.cbegin(), allSearchPaths.cend(), - [&path](const auto &sp) { return path.startsWith(sp); })) { - continue; - } - - if (std::any_of(searchExcludedPaths.cbegin(), searchExcludedPaths.cend(), - [&path](const auto &excluded) { return path.startsWith(excluded); })) { + // Path filtering, hidden file exclusion — handled at query layer + Lucene::String pathField = 
doc->get(LuceneFieldNames::Content::kPath); + if (pathField.empty()) { + qWarning() << "Document missing path field at index:" << scoreDoc->doc; continue; } - // Safely check hidden status - if (Q_LIKELY(!m_options.includeHidden())) { - try { - Lucene::String hiddenField = doc->get(LuceneFieldNames::Content::kIsHidden); - if (!hiddenField.empty() && QString::fromStdWString(hiddenField).toLower() == "y") { - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving is_hidden field:" << e.what(); - // Default to visible if field can't be read - } - } - - // 创建搜索结果 - SearchResult result(path); - - // 设置内容结果 + SearchResult result(QString::fromStdWString(pathField)); ContentResultAPI resultApi(result); - // 使用ContentHighlighter命名空间进行高亮 if (enableRetrieval) { try { - // Safely get contents with null check Lucene::String contentField = doc->get(LuceneFieldNames::Content::kContents); if (!contentField.empty()) { const QString content = QString::fromStdWString(contentField); @@ -394,10 +377,8 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } } catch (const Lucene::LuceneException &e) { qWarning() << "Exception retrieving content field:" << QString::fromStdWString(e.getError()); - // Continue without content highlight } catch (const std::exception &e) { qWarning() << "Standard exception retrieving content field:" << e.what(); - // Continue without content highlight } } @@ -447,11 +428,11 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } // 添加到结果集合 - m_results.append(result); + m_results.append(std::move(result)); // 实时发送结果 if (Q_UNLIKELY(m_options.resultFoundEnabled())) - emit resultFound(result); + emit resultFound(m_results.last()); } catch (const Lucene::LuceneException &e) { qWarning() << "Error processing result:" << QString::fromStdWString(e.getError()); @@ -499,7 +480,7 @@ void ContentIndexedStrategy::performContentSearch(const SearchQuery &query) AnalyzerPtr 
analyzer = newLucene(2, 2); // 构建查询 - m_currentQuery = buildLuceneQuery(query, analyzer, m_options.searchPath()); + m_currentQuery = buildLuceneQuery(query, analyzer); if (!m_currentQuery) { qWarning() << "Failed to build Lucene query"; emit errorOccurred(SearchError(ContentSearchErrorCode::ContentIndexException)); diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h index 724474ed..39028af6 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h @@ -39,7 +39,7 @@ class ContentIndexedStrategy : public ContentBaseStrategy void performContentSearch(const SearchQuery &query); // Build Lucene query - Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath); + Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer); // Helper for simple queries (original logic for "contents" field) Lucene::QueryPtr buildSimpleContentsQuery( const SearchQuery &query, diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp index 44c955c2..45487e71 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp @@ -316,8 +316,6 @@ void FileNameIndexedStrategy::search(const SearchQuery &query) void FileNameIndexedStrategy::performIndexSearch(const SearchQuery &query, const FileNameOptionsAPI &api) { bool caseSensitive = m_options.caseSensitive(); - const QString &searchPath = m_options.searchPath(); - const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); QStringList fileTypes = 
api.fileTypes(); QStringList fileExtensions = api.fileExtensions(); @@ -331,7 +329,7 @@ void FileNameIndexedStrategy::performIndexSearch(const SearchQuery &query, const IndexQuery indexQuery = buildIndexQuery(query, searchType, caseSensitive, pinyinEnabled, pinyinAcronymEnabled, fileTypes, fileExtensions); // 3. 执行查询并处理结果 - executeIndexQuery(indexQuery, searchPath, searchExcludedPaths); + executeIndexQuery(indexQuery); } FileNameIndexedStrategy::SearchType FileNameIndexedStrategy::determineSearchType( @@ -446,7 +444,7 @@ FileNameIndexedStrategy::IndexQuery FileNameIndexedStrategy::buildIndexQuery( return result; } -void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const QString &searchPath, const QStringList &searchExcludedPaths) +void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query) { // 获取索引目录 FSDirectoryPtr directory = m_indexManager->getIndexDirectory(m_indexDir); @@ -479,7 +477,7 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q // 构建查询 QueryPtr luceneQuery; try { - luceneQuery = buildLuceneQuery(query, searchPath); + luceneQuery = buildLuceneQuery(query); if (!luceneQuery) { emit errorOccurred(SearchError(SearchErrorCode::InvalidQuery)); return; @@ -522,9 +520,6 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q auto docsSize = scoreDocs.size(); m_results.reserve(docsSize); - // Get all search paths for post-filtering - QStringList allSearchPaths = m_options.searchPaths(); - // 实时处理搜索结果 for (int i = 0; i < docsSize; i++) { if (m_cancelled.load()) { @@ -537,29 +532,14 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q DocumentPtr doc = searcher->doc(scoreDoc->doc); QString path = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFullPath)); - // Check against all search paths - if (!std::any_of(allSearchPaths.cbegin(), allSearchPaths.cend(), - [&path](const auto &sp) { return path.startsWith(sp); })) { - 
continue; - } - - if (std::any_of(searchExcludedPaths.cbegin(), searchExcludedPaths.cend(), - [&path](const auto &excluded) { return path.startsWith(excluded); })) { - continue; - } - - if (Q_LIKELY(!m_options.includeHidden())) { - if (QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kIsHidden)).toLower() == "y") - continue; - } + // Path filtering, excluded paths, hidden file — handled at query layer // 处理搜索结果 if (Q_UNLIKELY(m_options.detailedResultsEnabled())) { m_results.append(processDetailedSearchResult(path, doc)); } else { - // perf: quickly SearchResult result(path); - m_results.append(result); + m_results.append(std::move(result)); } // 实时发送结果 @@ -637,7 +617,7 @@ SearchResult FileNameIndexedStrategy::processDetailedSearchResult( return result; } -Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &query, const QString &searchPath) const +Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &query) const { BooleanQueryPtr finalQuery = newLucene(); bool hasValidQuery = false; @@ -781,7 +761,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que // Add path prefix query optimization QStringList searchPathsList = m_options.searchPaths(); - if (hasValidQuery && SearchUtility::isFilenameIndexAncestorPathsSupported()) { + if (hasValidQuery) { QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( searchPathsList, QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); @@ -791,12 +771,25 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } } + // Add excluded paths filter (pushed down to query layer) + if (hasValidQuery) { + const QStringList &excludedPaths = m_options.searchExcludedPaths(); + if (!excludedPaths.isEmpty()) { + QueryPtr excludedQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + excludedPaths, + QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); + if (excludedQuery) { + 
finalQuery->add(excludedQuery, BooleanClause::MUST_NOT); + } + } + } + // Filter hidden files at query level to avoid losing results due to maxResults limit if (hasValidQuery && Q_LIKELY(!m_options.includeHidden())) { QueryPtr hiddenQuery = Lucene::newLucene( Lucene::newLucene( LuceneFieldNames::FileName::kIsHidden, - Lucene::StringUtils::toUnicode("Y"))); + L"Y")); finalQuery->add(hiddenQuery, Lucene::BooleanClause::MUST_NOT); } diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h index 2d4f143f..c9f7c763 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h @@ -83,10 +83,10 @@ class FileNameIndexedStrategy : public FileNameBaseStrategy const QStringList &fileExtensions); // 执行索引查询并处理结果 - void executeIndexQuery(const IndexQuery &query, const QString &searchPath, const QStringList &searchExcludedPaths); + void executeIndexQuery(const IndexQuery &query); // 构建 Lucene 查询 - QueryPtr buildLuceneQuery(const IndexQuery &query, const QString &searchPath) const; + QueryPtr buildLuceneQuery(const IndexQuery &query) const; // 构建布尔查询的辅助方法 BooleanQueryPtr buildBooleanTermsQuery(const IndexQuery &query, const AnalyzerPtr &analyzer) const; diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index da9aceda..705c0063 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -22,7 +22,6 @@ #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" #include "utils/lucenequeryutils.h" -#include "utils/searchutility.h" #include "utils/lucene_cancellation_compat.h" 
#include "utils/timerangeutils.h" @@ -63,7 +62,7 @@ void OcrTextIndexedStrategy::search(const SearchQuery &query) } } -Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath) +Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer) { try { m_keywords.clear(); @@ -97,9 +96,41 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que mainQuery = newLucene(); // Should not happen } + // Add filename keyword query (before filters, so it replaces empty content query correctly) + QString filenameKw = optAPI.filenameKeyword(); + if (!filenameKw.isEmpty()) { + Lucene::QueryParserPtr filenameParser = newLucene( + Lucene::LuceneVersion::LUCENE_CURRENT, + LuceneFieldNames::OcrText::kFilename, + analyzer); + Lucene::QueryPtr filenameQuery = filenameParser->parse( + LuceneQueryUtils::processQueryString(filenameKw, false)); + + if (filenameQuery) { + // Check if content keywords are effectively empty + bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) + ? 
query.keyword().isEmpty() + : (query.subQueries().isEmpty() + || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), + [](const auto &sq) { return sq.keyword().isEmpty(); })); + + if (noContentKeywords) { + // Filename-only search: replace empty content query with filename query + mainQuery = filenameQuery; + } else if (mainQuery) { + // Both content and filename: AND combination + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(filenameQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } + m_keywords.append(filenameKw); + } + } + // Add path prefix query optimization QStringList searchPathsList = m_options.searchPaths(); - if (mainQuery && SearchUtility::isOcrTextIndexAncestorPathsSupported()) { + if (mainQuery) { QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( searchPathsList, QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); @@ -112,6 +143,34 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add excluded paths filter (pushed down to query layer) + if (mainQuery) { + const QStringList &excludedPaths = m_options.searchExcludedPaths(); + if (!excludedPaths.isEmpty()) { + QueryPtr excludedQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + excludedPaths, + QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); + if (excludedQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(excludedQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + } + } + + // Add hidden file filter (pushed down to query layer) + if (mainQuery && !m_options.includeHidden()) { + QueryPtr hiddenQuery = Lucene::newLucene( + Lucene::newLucene( + LuceneFieldNames::OcrText::kIsHidden, + L"Y")); + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(hiddenQuery, 
BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); @@ -154,38 +213,6 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } - // Add filename keyword query - QString filenameKw = optAPI.filenameKeyword(); - if (!filenameKw.isEmpty()) { - Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::OcrText::kFilename, - analyzer); - Lucene::QueryPtr filenameQuery = filenameParser->parse( - LuceneQueryUtils::processQueryString(filenameKw, false)); - - if (filenameQuery) { - // Check if content keywords are effectively empty - bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) - ? query.keyword().isEmpty() - : (query.subQueries().isEmpty() - || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), - [](const auto &sq) { return sq.keyword().isEmpty(); })); - - if (noContentKeywords) { - // Filename-only search: use filename query directly - mainQuery = filenameQuery; - } else if (mainQuery) { - // Both content and filename: AND combination - BooleanQueryPtr finalQuery = newLucene(); - finalQuery->add(mainQuery, BooleanClause::MUST); - finalQuery->add(filenameQuery, BooleanClause::MUST); - mainQuery = finalQuery; - } - m_keywords.append(filenameKw); - } - } - return mainQuery; } catch (const Lucene::LuceneException &e) { @@ -288,9 +315,6 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QElapsedTimer resultTimer; resultTimer.start(); - QString searchPath = m_options.searchPath(); - const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); - QStringList allSearchPaths = m_options.searchPaths(); auto docsSize = scoreDocs.size(); OcrTextOptionsAPI optAPI(m_options); @@ -298,6 +322,9 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr int previewLen = 
optAPI.maxPreviewLength() > 0 ? optAPI.maxPreviewLength() : 50; bool enableRetrieval = optAPI.isFullTextRetrievalEnabled(); + // Pre-allocate to avoid reallocation during append + m_results.reserve(m_results.size() + static_cast(docsSize)); + for (int32_t i = 0; i < docsSize; ++i) { if (m_cancelled.load()) { qInfo() << "OCR text search cancelled"; @@ -306,18 +333,11 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr try { Lucene::ScoreDocPtr scoreDoc = scoreDocs[i]; - if (!scoreDoc) { - qWarning() << "Null ScoreDoc encountered at index" << i; - continue; - } - - // Defensive check: verify document ID is valid - if (scoreDoc->doc < 0) { - qWarning() << "Invalid document ID:" << scoreDoc->doc; + if (!scoreDoc || scoreDoc->doc < 0) { + qWarning() << "Invalid ScoreDoc at index" << i; continue; } - // Safely retrieve document (could throw if index is corrupted) Lucene::DocumentPtr doc; try { doc = searcher->doc(scoreDoc->doc); @@ -333,47 +353,14 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr continue; } - // Safely get path - Lucene::String pathField; - try { - pathField = doc->get(LuceneFieldNames::OcrText::kPath); - if (pathField.empty()) { - qWarning() << "Document missing path field at index:" << scoreDoc->doc; - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving path field:" << e.what(); - continue; - } - - QString path = QString::fromStdWString(pathField); - - // Check against all search paths - if (!std::any_of(allSearchPaths.cbegin(), allSearchPaths.cend(), - [&path](const auto &sp) { return path.startsWith(sp); })) { - continue; - } - - if (std::any_of(searchExcludedPaths.cbegin(), searchExcludedPaths.cend(), - [&path](const auto &excluded) { return path.startsWith(excluded); })) { + // Path filtering, excluded paths, hidden file — handled at query layer + Lucene::String pathField = doc->get(LuceneFieldNames::OcrText::kPath); + if (pathField.empty()) { + 
qWarning() << "Document missing path field at index:" << scoreDoc->doc; continue; } - // Safely check hidden status - if (Q_LIKELY(!m_options.includeHidden())) { - try { - Lucene::String hiddenField = doc->get(LuceneFieldNames::OcrText::kIsHidden); - if (!hiddenField.empty() && QString::fromStdWString(hiddenField).toLower() == "y") { - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving is_hidden field:" << e.what(); - // Default to visible if field can't be read - } - } - - // Create search result - SearchResult result(path); + SearchResult result(QString::fromStdWString(pathField)); // 设置 OCR 内容结果 OcrTextResultAPI resultApi(result); @@ -381,7 +368,6 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr // 使用ContentHighlighter命名空间进行高亮 if (enableRetrieval) { try { - // Safely get OCR contents with null check Lucene::String ocrContentField = doc->get(LuceneFieldNames::OcrText::kOcrContents); if (!ocrContentField.empty()) { const QString content = QString::fromStdWString(ocrContentField); @@ -394,10 +380,8 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } } catch (const Lucene::LuceneException &e) { qWarning() << "Exception retrieving OCR content field:" << QString::fromStdWString(e.getError()); - // Continue without content highlight } catch (const std::exception &e) { qWarning() << "Standard exception retrieving OCR content field:" << e.what(); - // Continue without content highlight } } @@ -453,11 +437,11 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } // Add to result collection - m_results.append(result); + m_results.append(std::move(result)); // Real-time result emission if (Q_UNLIKELY(m_options.resultFoundEnabled())) - emit resultFound(result); + emit resultFound(m_results.last()); } catch (const Lucene::LuceneException &e) { qWarning() << "Error processing result:" << QString::fromStdWString(e.getError()); @@ -504,7 
+488,7 @@ void OcrTextIndexedStrategy::performOcrTextSearch(const SearchQuery &query) AnalyzerPtr analyzer = newLucene(); // Build query - m_currentQuery = buildLuceneQuery(query, analyzer, m_options.searchPath()); + m_currentQuery = buildLuceneQuery(query, analyzer); if (!m_currentQuery) { qWarning() << "Failed to build Lucene query for OCR text search"; emit errorOccurred(SearchError(OcrTextSearchErrorCode::OcrTextIndexException)); diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h index b09aead1..d1d908a4 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h @@ -40,7 +40,7 @@ class OcrTextIndexedStrategy : public OcrTextBaseStrategy void performOcrTextSearch(const SearchQuery &query); // Build Lucene query - Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath); + Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer); // Helper for simple queries Lucene::QueryPtr buildSimpleOcrContentsQuery( diff --git a/src/dfm-search/dfm-search-lib/utils/searchutility.cpp b/src/dfm-search/dfm-search-lib/utils/searchutility.cpp index 68186e25..6327f4fb 100644 --- a/src/dfm-search/dfm-search-lib/utils/searchutility.cpp +++ b/src/dfm-search/dfm-search-lib/utils/searchutility.cpp @@ -24,13 +24,6 @@ using namespace Lucene; namespace Global { -// Index version threshold constants -namespace IndexVersionThresholds { -constexpr int FILENAME_ANCESTOR_PATHS = 3; -constexpr int CONTENT_ANCESTOR_PATHS = 1; -constexpr int OCRTEXT_ANCESTOR_PATHS = 1; -} - /** * @brief Read index version from a JSON status file * @param indexDir The index directory path @@ -831,21 +824,6 @@ int ocrTextIndexVersion() namespace SearchUtility { -bool 
isFilenameIndexAncestorPathsSupported() -{ - return Global::fileNameIndexVersion() > Global::IndexVersionThresholds::FILENAME_ANCESTOR_PATHS; -} - -bool isContentIndexAncestorPathsSupported() -{ - return Global::contentIndexVersion() > Global::IndexVersionThresholds::CONTENT_ANCESTOR_PATHS; -} - -bool isOcrTextIndexAncestorPathsSupported() -{ - return Global::ocrTextIndexVersion() > Global::IndexVersionThresholds::OCRTEXT_ANCESTOR_PATHS; -} - QStringList extractBooleanKeywords(const SearchQuery &query) { QStringList keywords; diff --git a/src/dfm-search/dfm-search-lib/utils/searchutility.h b/src/dfm-search/dfm-search-lib/utils/searchutility.h index 19d177bc..2707b288 100644 --- a/src/dfm-search/dfm-search-lib/utils/searchutility.h +++ b/src/dfm-search/dfm-search-lib/utils/searchutility.h @@ -33,27 +33,6 @@ QStringList extractBooleanKeywords(const SearchQuery &query); */ QStringList deepinAnythingFileTypes(); -/** - * @brief Check if the filename index supports the ancestor_paths field. - * This function checks the filename index version and returns true if the version is greater than 3. - * @return true if the filename index supports ancestor_paths, false otherwise. - */ -bool isFilenameIndexAncestorPathsSupported(); - -/** - * @brief Check if the content index supports the ancestor_paths field. - * This function checks the content index version and returns true if the version is greater than 1. - * @return true if the content index supports ancestor_paths, false otherwise. - */ -bool isContentIndexAncestorPathsSupported(); - -/** - * @brief Check if the OCR text index supports the ancestor_paths field. - * This function checks the OCR text index version and returns true if the version supports ancestor_paths. - * @return true if the OCR text index supports ancestor_paths, false otherwise. 
- */ -bool isOcrTextIndexAncestorPathsSupported(); - } // namespace SearchUtility DFM_SEARCH_END_NS From 745b093bf893c2eef41ee081580cc3d51ea46612 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Tue, 19 May 2026 13:46:13 +0800 Subject: [PATCH 22/36] feat: add on-demand content highlight retrieval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Implement ContentRetriever class for fetching highlighted content from Lucene index 2. Add subcommand support in CLI with "highlight" mode 3. Support both single file and batch highlight fetching 4. Implement text and JSON output formats 5. Add configuration options for snippet length and HTML wrapping 6. Handle OCR content and regular content search separately 7. Include error handling and graceful fallbacks Log: 1. Added standalone highlight extraction feature via new ContentRetriever class 2. Added CLI subcommand: "highlight" mode supports fetching snippets without full search 3. Supports both text and machine-readable JSON output formats Influence: 1. Test highlight retrieval with various file types (txt, pdf, images with OCR) 2. Verify CLI highlight subcommand with different combinations of parameters 3. Test boundary cases - empty input, non-existent files, invalid paths 4. Verify JSON output format is valid and complete 5. Test error handling when index is corrupted or unavailable 6. Validate performance with large batches of paths feat: 添加按需内容高亮检索功能 1. 实现 ContentRetriever 类用于从 Lucene 索引获取高亮内容 2. 在 CLI 中添加子命令支持,实现"highlight"模式 3. 支持单文件和批量高亮内容获取 4. 实现文本和 JSON 两种输出格式 5. 添加配置选项用于控制片段长度和 HTML 包裹 6. 区分处理 OCR 内容和常规内容搜索 7. 包含错误处理和优雅降级机制 Log: 1. 新增独立的高亮内容提取功能,通过新增的 ContentRetriever 类实现 2. 新增 CLI 子命令:支持在不执行完整搜索的情况下通过"highlight"模式获取 内容片段 3. 支持文本和机器可读的 JSON 两种输出格式 Influence: 1. 测试不同类型文件的高亮检索功能(txt、pdf、带OCR的图片等) 2. 使用不同参数组合验证 CLI highlight 子命令 3. 测试边界情况-空输入、不存在的文件、无效路径 4. 验证 JSON 输出格式的有效性和完整性 5. 测试索引损坏或不可用时的错误处理 6. 
验证大批量路径请求时的性能表现 --- .../dfm-search/dfm-search/contentretriever.h | 76 +++++++ .../dfm-search-client/cli_options.cpp | 60 +++++- .../dfm-search-client/cli_options.h | 5 +- src/dfm-search/dfm-search-client/main.cpp | 59 +++++- .../dfm-search-lib/utils/contentretriever.cpp | 185 ++++++++++++++++++ 5 files changed, 381 insertions(+), 4 deletions(-) create mode 100644 include/dfm-search/dfm-search/contentretriever.h create mode 100644 src/dfm-search/dfm-search-lib/utils/contentretriever.cpp diff --git a/include/dfm-search/dfm-search/contentretriever.h b/include/dfm-search/dfm-search/contentretriever.h new file mode 100644 index 00000000..cd4d4701 --- /dev/null +++ b/include/dfm-search/dfm-search/contentretriever.h @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef CONTENTRETRIEVER_H +#define CONTENTRETRIEVER_H + +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +/** + * @brief Lightweight options for highlight extraction + */ +struct HighlightOptions +{ + int maxPreviewLength = 200; ///< Maximum snippet length in characters + bool enableHtml = false; ///< Wrap matched keywords with tags +}; + +/** + * @brief Retrieves highlighted content from Lucene index on demand + * + * Provides a standalone mechanism to fetch highlighted content snippets + * for specific file paths without running a full search pipeline. + * + * Typical usage pattern: + * 1. Perform a search with isFullTextRetrievalEnabled() = false (fast) + * 2. Display path-only results to the user immediately + * 3. On demand (e.g., scroll into view), call fetchHighlight() per path + * + * This decouples highlight extraction from the search pipeline, + * enabling lazy-loading similar to thumbnail fetching. 
+ */ +class ContentRetriever : public QObject +{ + Q_OBJECT + +public: + explicit ContentRetriever(QObject *parent = nullptr); + ~ContentRetriever() override; + + /** + * @brief Synchronously fetch highlighted content for a single file + * + * Opens the Lucene index, locates the document by path, + * extracts stored text, and runs ContentHighlighter to produce + * a highlighted snippet. + * + * @param path Absolute file path + * @param keyword Search keyword (supports comma-separated for multi-keyword) + * @param type SearchType::Content or SearchType::Ocr + * @param options Highlight configuration (preview length, HTML toggle) + * @return Highlighted snippet, or empty string if not found + */ + QString fetchHighlight(const QString &path, + const QString &keyword, + SearchType type, + const HighlightOptions &options = {}) const; + + /** + * @brief Synchronously fetch highlights for multiple files + * @return Mapping of path -> highlighted content (empty string if not found) + */ + QMap fetchHighlights(const QStringList &paths, + const QString &keyword, + SearchType type, + const HighlightOptions &options = {}) const; +}; + +DFM_SEARCH_END_NS + +#endif // CONTENTRETRIEVER_H diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index 38f9e8f7..2c3bcdc0 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -59,7 +59,7 @@ void CliOptions::setupOptions() m_parser.setApplicationDescription("DFM Search Client"); m_parser.addHelpOption(); - // 基本选项 + // Basic options m_parser.addOption(m_typeOption); m_parser.addOption(m_methodOption); m_parser.addOption(m_queryOption); @@ -94,7 +94,7 @@ void CliOptions::setupOptions() m_parser.addOption(m_sizeMinOption); m_parser.addOption(m_sizeMaxOption); - // 位置参数 + // Positional arguments m_parser.addPositionalArgument("keyword", "Search keyword"); m_parser.addPositionalArgument("search_path", "Path to search 
in"); } @@ -176,13 +176,69 @@ void CliOptions::printHelp() const std::cout << std::endl; std::cout << " # Filename search with file size filter (1MB to 100MB)" << std::endl; std::cout << " dfm-searcher --size-min=1M --size-max=100M \"video\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << "Highlight Retrieval (on-demand):" << std::endl; + std::cout << " dfm-searcher highlight --type= [path2 ...] [-j]" << std::endl; + std::cout << " Fetch highlighted content snippets for specific files without running a full search." << std::endl; + std::cout << std::endl; + std::cout << " # Fetch highlight for a single file" << std::endl; + std::cout << " dfm-searcher highlight --type=content \"hello\" /home/user/doc.txt" << std::endl; + std::cout << std::endl; + std::cout << " # Batch fetch highlights with JSON output" << std::endl; + std::cout << " dfm-searcher highlight --type=ocr \"screenshot\" img1.png img2.png -j" << std::endl; } bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) { + // Pre-scan for "highlight" subcommand + const QStringList rawArgs = app.arguments(); + if (rawArgs.size() >= 2 && rawArgs.at(1) == "highlight") { + config.subcommand = "highlight"; + } + m_parser.process(app); QStringList positionalArgs = m_parser.positionalArguments(); + + // For highlight subcommand, the positional args are: keyword= + paths= + // "highlight" itself is consumed as first positional by QCommandLineParser + if (config.subcommand == "highlight") { + // Skip "highlight" keyword from positional args (it was parsed as the first positional) + QStringList args = positionalArgs; + if (!args.isEmpty() && args.first() == "highlight") { + args.removeFirst(); + } + if (args.isEmpty()) { + std::cerr << "Error: highlight requires and at least one " << std::endl; + return false; + } + + config.keyword = args.first(); + // Remaining args are file paths + config.searchPath = args.mid(1).join(','); // Reuse searchPath to store comma-separated paths + + // 
Validate search type for highlight + QString typeStr = m_parser.value(m_typeOption); + if (typeStr == "content") { + config.searchType = SearchType::Content; + } else if (typeStr == "ocr") { + config.searchType = SearchType::Ocr; + } else { + std::cerr << "Error: highlight requires --type=content or --type=ocr" << std::endl; + return false; + } + + config.jsonOutput = m_parser.isSet(m_jsonOption); + if (m_parser.isSet(m_maxPreviewOption)) { + bool ok; + int previewLength = m_parser.value(m_maxPreviewOption).toInt(&ok); + if (ok && previewLength > 0) { + config.maxPreviewLength = previewLength; + } + } + return true; + } + if (positionalArgs.isEmpty()) { printHelp(); return false; diff --git a/src/dfm-search/dfm-search-client/cli_options.h b/src/dfm-search/dfm-search-client/cli_options.h index 6fce5f3f..8e1179c6 100644 --- a/src/dfm-search/dfm-search-client/cli_options.h +++ b/src/dfm-search/dfm-search-client/cli_options.h @@ -23,7 +23,10 @@ namespace dfmsearch { */ struct SearchCliConfig { - // 基本参数 + // Subcommand: if non-empty, the first positional arg is treated as a subcommand + QString subcommand; // "" (search), "highlight" + + // Basic parameters QString keyword; QString searchPath; SearchType searchType = SearchType::FileName; diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index f6d7d4f9..9500f719 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -4,6 +4,10 @@ #include #include +#include +#include +#include +#include #include #include @@ -13,11 +17,14 @@ #include #include #include +#include #include "cli_options.h" #include "output/text_output.h" #include "output/json_output.h" +#include + using namespace dfmsearch; /** @@ -155,13 +162,63 @@ int main(int argc, char *argv[]) { QCoreApplication app(argc, argv); - // 解析命令行参数 + // Parse CLI arguments CliOptions cliOptions; SearchCliConfig config; if (!cliOptions.parse(app, config)) { return 1; } + // 
Highlight subcommand: fetch highlighted content on demand + if (config.subcommand == "highlight") { + DFMSEARCH::ContentRetriever retriever; + DFMSEARCH::HighlightOptions hlOptions; + hlOptions.maxPreviewLength = config.maxPreviewLength; + + // Paths are stored as comma-separated in config.searchPath +#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) + QStringList paths = config.searchPath.split(',', Qt::SkipEmptyParts); +#else + QStringList paths = config.searchPath.split(',', QString::SkipEmptyParts); +#endif + + if (config.jsonOutput) { + // JSON output + QJsonObject root; + root["type"] = "highlight"; + root["searchType"] = (config.searchType == SearchType::Content) ? "content" : "ocr"; + root["keyword"] = config.keyword; + + QJsonArray results; + for (const QString &path : paths) { + QJsonObject item; + item["path"] = path; + item["contentMatch"] = retriever.fetchHighlight(path, config.keyword, config.searchType, hlOptions); + results.append(item); + } + + root["totalResults"] = results.size(); + root["results"] = results; + + QJsonDocument doc(root); + std::cout << doc.toJson(QJsonDocument::Indented).toStdString() << std::endl; + } else { + // Text output + QTextStream out(stdout); + for (const QString &path : paths) { + QString hl = retriever.fetchHighlight(path, config.keyword, config.searchType, hlOptions); + out << path << "\n"; + if (!hl.isEmpty()) { + out << " " << hl << "\n"; + } else { + out << " (no match)\n"; + } + out << Qt::endl; + } + } + return 0; + } + // Semantic search mode if (config.semanticMode) { auto *semanticSearcher = new DFMSEARCH::SemanticSearcher(&app); diff --git a/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp b/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp new file mode 100644 index 00000000..38326581 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp @@ -0,0 +1,185 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include "utils/contenthighlighter.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Lucene; + +DFM_SEARCH_BEGIN_NS + +namespace { + +/** + * @brief Get the Lucene content field name for the given search type + */ +const wchar_t *contentFieldName(SearchType type) +{ + return (type == SearchType::Ocr) + ? LuceneFieldNames::OcrText::kOcrContents + : LuceneFieldNames::Content::kContents; +} + +/** + * @brief Get the Lucene path field name for the given search type + */ +const wchar_t *pathFieldName(SearchType type) +{ + return (type == SearchType::Ocr) + ? LuceneFieldNames::OcrText::kPath + : LuceneFieldNames::Content::kPath; +} + +/** + * @brief Get the index directory path for the given search type + */ +QString indexDirectoryForType(SearchType type) +{ + return (type == SearchType::Ocr) + ? Global::ocrTextIndexDirectory() + : Global::contentIndexDirectory(); +} + +/** + * @brief Split keyword string into a QStringList suitable for ContentHighlighter + * + * Supports comma-separated multi-keyword input, consistent with + * how the search pipeline handles boolean queries. 
+ */
+QStringList splitKeywords(const QString &keyword)
+{
+    if (keyword.isEmpty()) return {};
+    // The keyword is a comma-separated list; empty segments (e.g. "a,,b") are dropped.
+    // Qt 5.15 moved the split-behavior enum from QString:: to the Qt:: namespace.
+#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
+    return keyword.split(',', Qt::SkipEmptyParts);
+#else
+    return keyword.split(',', QString::SkipEmptyParts);
+#endif
+}
+
+}   // namespace
+
+ContentRetriever::ContentRetriever(QObject *parent)
+    : QObject(parent)
+{
+}
+
+ContentRetriever::~ContentRetriever() = default;
+
+/**
+ * @brief Builds a highlighted preview of the indexed text stored for one path.
+ *
+ * Opens the index that belongs to @p type, looks up the document whose path
+ * field equals @p path (exact term match) and runs the highlighter over its
+ * stored content field.
+ *
+ * @param path     Path to look up in the index.
+ * @param keyword  Comma-separated keywords to highlight.
+ * @param type     Only SearchType::Content and SearchType::Ocr are handled.
+ * @param options  Supplies maxPreviewLength and enableHtml for the highlighter.
+ * @return The highlighted text; an empty string when the arguments are rejected,
+ *         no document matches, the stored content is empty, or an exception
+ *         was caught (a warning is logged in that case).
+ */
+QString ContentRetriever::fetchHighlight(const QString &path,
+                                         const QString &keyword,
+                                         SearchType type,
+                                         const HighlightOptions &options) const
+{
+    if (path.isEmpty() || keyword.isEmpty()) return {};
+    if (type != SearchType::Content && type != SearchType::Ocr) return {};
+
+    QStringList keywords = splitKeywords(keyword);
+    if (keywords.isEmpty()) return {};
+
+    try {
+        const QString indexDir = indexDirectoryForType(type);
+        IndexReaderPtr reader = IndexReader::open(FSDirectory::open(indexDir.toStdWString()));
+        IndexSearcherPtr searcher = newLucene<IndexSearcher>(reader);
+
+        // Build a term query on the path field to find the exact document
+        TermPtr term = newLucene<Term>(pathFieldName(type), path.toStdWString());
+        QueryPtr query = newLucene<TermQuery>(term);
+
+        // Only the first hit is needed
+        TopDocsPtr topDocs = searcher->search(query, 1);
+        if (!topDocs || topDocs->totalHits == 0) {
+            return {};
+        }
+
+        DocumentPtr doc = searcher->doc(topDocs->scoreDocs[0]->doc);
+        String contentField = doc->get(contentFieldName(type));
+        if (contentField.empty()) {
+            return {};
+        }
+
+        const QString content = QString::fromStdWString(contentField);
+        return ContentHighlighter::customHighlight(
+                keywords, content, options.maxPreviewLength, options.enableHtml);
+
+    } catch (const LuceneException &e) {
+        qWarning() << "ContentRetriever: error fetching highlight for" << path
+                   << QString::fromStdWString(e.getError());
+        return {};
+    } catch (const std::exception &e) {
+        qWarning() << "ContentRetriever: std error for" << path << e.what();
+        return {};
+    }
+}
+
+/**
+ * @brief Batch variant of fetchHighlight(): opens the index once and resolves
+ *        every path against the same searcher.
+ *
+ * @param paths    Paths to look up in the index.
+ * @param keyword  Comma-separated keywords to highlight.
+ * @param type     Only SearchType::Content and SearchType::Ocr are handled.
+ * @param options  Supplies maxPreviewLength and enableHtml for the highlighter.
+ * @return A map with one entry per processed path; the value is an empty string
+ *         when the path has no document, no stored content, or its lookup threw.
+ *         The map is empty when the arguments are rejected or the index could
+ *         not be opened.
+ */
+QMap<QString, QString> ContentRetriever::fetchHighlights(const QStringList &paths,
+                                                         const QString &keyword,
+                                                         SearchType type,
+                                                         const HighlightOptions &options) const
+{
+    QMap<QString, QString> results;
+    if (paths.isEmpty() || keyword.isEmpty()) return results;
+    if (type != SearchType::Content && type != SearchType::Ocr) return results;
+
+    QStringList keywords = splitKeywords(keyword);
+    if (keywords.isEmpty()) return results;
+
+    try {
+        // Open index reader once for all paths
+        const QString indexDir = indexDirectoryForType(type);
+        IndexReaderPtr reader = IndexReader::open(FSDirectory::open(indexDir.toStdWString()));
+        IndexSearcherPtr searcher = newLucene<IndexSearcher>(reader);
+
+        for (const QString &path : paths) {
+            // Per-path try block: one failing lookup must not abort the whole batch
+            try {
+                TermPtr term = newLucene<Term>(pathFieldName(type), path.toStdWString());
+                QueryPtr query = newLucene<TermQuery>(term);
+
+                TopDocsPtr topDocs = searcher->search(query, 1);
+                if (!topDocs || topDocs->totalHits == 0) {
+                    results.insert(path, {});
+                    continue;
+                }
+
+                DocumentPtr doc = searcher->doc(topDocs->scoreDocs[0]->doc);
+                String contentField = doc->get(contentFieldName(type));
+                if (contentField.empty()) {
+                    results.insert(path, {});
+                    continue;
+                }
+
+                const QString content = QString::fromStdWString(contentField);
+                results.insert(path, ContentHighlighter::customHighlight(keywords, content, options.maxPreviewLength, options.enableHtml));
+
+            } catch (const LuceneException &e) {
+                qWarning() << "ContentRetriever: error for" << path
+                           << QString::fromStdWString(e.getError());
+                results.insert(path, {});
+            } catch (const std::exception &e) {
+                qWarning() << "ContentRetriever: std error for" << path << e.what();
+                results.insert(path, {});
+            }
+        }
+    } catch (const LuceneException &e) {
+        qWarning() << "ContentRetriever: failed to open index"
+                   << QString::fromStdWString(e.getError());
+    } catch (const std::exception &e) {
+        // Mirror fetchHighlight(): a non-Lucene failure while opening the index
+        // must not escape to the caller either.
+        qWarning() << "ContentRetriever: std error opening index" << e.what();
+    }
+
+    return results;
+}
+
+DFM_SEARCH_END_NS
From 44356d2e0983debb4b06e39610e88f4092324db6 Mon Sep 17 00:00:00 2001
From: Zhang Sheng
Date: Thu, 21 May 2026 09:09:10 +0800
Subject: [PATCH 23/36] refactor: improve NGramTokenizer and search factory
MIME-Version: 1.0
Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added normalizeGramSize method to enforce valid n-gram sizes between 1 and kIoBufferSize 2. Added resetState method to consolidate common reset logic 3. Added new reset method with ReaderPtr parameter for better resource management 4. Improved buffer initialization using std::fill_n instead of memset for consistency 5. Added Semantic search type support in SearchFactory 6. Fixed code formatting and alignment in header file 7. Enhanced NGramTokenizer robustness by validating gram sizes upon construction Log: Added Semantic search type support in search factory Influence: 1. Test NGramTokenizer with various min/max gram inputs including edge cases 2. Verify search factory properly handles Semantic search type 3. Ensure tokenizer correctly processes input after reset operations 4. Validate normalization of extreme gram size values 5. Test buffer handling with different input sizes refactor: 改进 NGramTokenizer 和搜索工厂 1. 新增 normalizeGramSize 方法确保 n-gram 大小在 1 至 kIoBufferSize 之间 2. 添加 resetState 方法统一重置逻辑 3. 新增带 ReaderPtr 参数的 reset 方法改进资源管理 4. 使用 std::fill_n 代替 memset 以提高缓冲区初始化一致性 5. 在搜索工厂中添加对 Semantic 搜索类型的支持 6. 修复头文件中的格式和对齐问题 7. 通过在构造函数中验证 gram 大小增强 NGramTokenizer 鲁棒性 Log: 搜索工厂新增支持 Semantic 搜索类型 Influence: 1. 测试 NGramTokenizer 处理各种 min/max gram 输入,包括边界情况 2. 验证搜索工厂正确处理 Semantic 搜索类型 3. 确保分词器在重置操作后正确处理输入 4. 测试极端 gram 大小值的规范化 5. 
测试不同输入大小的缓冲区处理 --- .../dfm-search/lucene++/ngramtokenizer.h | 19 +++++++----- .../dfm-search-lib/core/searchfactory.cpp | 1 + .../lucene++/ngramtokenizer.cpp | 30 +++++++++++++++---- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h index fa9b57a7..7fecaa11 100644 --- a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h +++ b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h @@ -12,8 +12,8 @@ namespace Lucene { class NGramTokenizer : public Tokenizer { public: - NGramTokenizer(const ReaderPtr& input, int32_t minGram, int32_t maxGram); - NGramTokenizer(const AttributeFactoryPtr& factory, const ReaderPtr& input, + NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram); + NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, int32_t minGram, int32_t maxGram); virtual ~NGramTokenizer(); @@ -24,11 +24,14 @@ class NGramTokenizer : public Tokenizer virtual bool incrementToken(); virtual void end(); - using Tokenizer::reset; virtual void reset(); + virtual void reset(const ReaderPtr &input); private: + static int32_t normalizeGramSize(int32_t gramSize); + void init(); + void resetState(); bool fillBuffer(int32_t need); int32_t m_minGram; @@ -41,16 +44,16 @@ class NGramTokenizer : public Tokenizer int32_t m_bufferIndex; bool m_inputExhausted; - int32_t m_offset; // current position in the logical input stream - int32_t m_gramSize; // current n-gram size being emitted + int32_t m_offset; // current position in the logical input stream + int32_t m_gramSize; // current n-gram size being emitted CharArray m_termBuffer; TermAttributePtr m_termAtt; OffsetAttributePtr m_offsetAtt; PositionIncrementAttributePtr m_posIncrAtt; - bool m_isFirstTokenAtPosition; // true = positionIncrement=1, false = 0 (same position) + bool m_isFirstTokenAtPosition; // true = positionIncrement=1, false = 0 (same position) }; -} 
// namespace Lucene +} // namespace Lucene -#endif // NGRAMTOKENIZER_H \ No newline at end of file +#endif // NGRAMTOKENIZER_H diff --git a/src/dfm-search/dfm-search-lib/core/searchfactory.cpp b/src/dfm-search/dfm-search-lib/core/searchfactory.cpp index 4d092ccf..d266b617 100644 --- a/src/dfm-search/dfm-search-lib/core/searchfactory.cpp +++ b/src/dfm-search/dfm-search-lib/core/searchfactory.cpp @@ -20,6 +20,7 @@ SearchEngine *SearchFactory::createEngine(SearchType type, QObject *parent) case SearchType::Ocr: engine = new SearchEngine(type, parent); break; + case SearchType::Semantic: case SearchType::Custom: // TODO: Created by application based on provider break; diff --git a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp index b48f70ae..73336260 100644 --- a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp +++ b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp @@ -19,8 +19,8 @@ const int32_t NGramTokenizer::kIoBufferSize; NGramTokenizer::NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram) : Tokenizer(input), - m_minGram(std::min(minGram, maxGram)), - m_maxGram(std::max(minGram, maxGram)), + m_minGram(std::min(normalizeGramSize(minGram), normalizeGramSize(maxGram))), + m_maxGram(std::max(normalizeGramSize(minGram), normalizeGramSize(maxGram))), m_isFirstTokenAtPosition(true) { init(); @@ -29,8 +29,8 @@ NGramTokenizer::NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t NGramTokenizer::NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, int32_t minGram, int32_t maxGram) : Tokenizer(factory, input), - m_minGram(std::min(minGram, maxGram)), - m_maxGram(std::max(minGram, maxGram)), + m_minGram(std::min(normalizeGramSize(minGram), normalizeGramSize(maxGram))), + m_maxGram(std::max(normalizeGramSize(minGram), normalizeGramSize(maxGram))), m_isFirstTokenAtPosition(true) { init(); @@ -40,21 +40,39 @@ 
NGramTokenizer::~NGramTokenizer() { } +int32_t NGramTokenizer::normalizeGramSize(int32_t gramSize) +{ + return std::min(std::max(gramSize, 1), kIoBufferSize); +} + void NGramTokenizer::init() { m_ioBuffer = CharArray::newInstance(kIoBufferSize); - memset(m_ioBuffer.get(), 0, kIoBufferSize); + std::fill_n(m_ioBuffer.get(), kIoBufferSize, 0); m_termBuffer = CharArray::newInstance(m_maxGram); - memset(m_termBuffer.get(), 0, m_maxGram); + std::fill_n(m_termBuffer.get(), m_maxGram, 0); m_termAtt = addAttribute(); m_offsetAtt = addAttribute(); m_posIncrAtt = addAttribute(); + + resetState(); } void NGramTokenizer::reset() { Tokenizer::reset(); + resetState(); +} + +void NGramTokenizer::reset(const ReaderPtr &input) +{ + Tokenizer::reset(input); + resetState(); +} + +void NGramTokenizer::resetState() +{ m_bufferIndex = 0; m_ioLen = 0; m_inputExhausted = false; From 2ebb5df06d5c75d845d53dca2db26453ff05e232 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 21 May 2026 10:33:27 +0800 Subject: [PATCH 24/36] refactor: improve OCR text search validation and analyzer selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Changed keyword length validation to use unicode character count instead of UTF-8 byte size for more accurate measurement 2. Replaced ChineseAnalyzer with NGramAnalyzer(2,2) for better OCR text fuzzy matching 3. Removed dependency on 3rdparty fulltext/chineseanalyzer.h 4. Added dfm-search/lucene++/ngramanalyzer.h include instead These changes were made because: 1. Measuring text length in characters is more appropriate than bytes for validation 2. NGram analyzer provides better results for OCR text which often contains recognition errors 3. Using built-in analyzer removes external dependency and improves maintainability Log: Improved accuracy of OCR text search results Influence: 1. Test search with various Unicode characters and check validation behavior 2. 
Verify search results quality with partially recognized OCR text 3. Test with both short and long search queries 4. Check performance impact with new analyzer refactor: 优化OCR文本搜索验证和分析器选择 1. 将关键词长度验证从UTF-8字节数改为Unicode字符数,以获得更准确的测量 2. 用NGramAnalyzer(2,2)替换ChineseAnalyzer实现更好的OCR文本模糊匹配 3. 移除了对3rdparty fulltext/chineseanalyzer.h的依赖 4. 添加了dfm-search/lucene++/ngramanalyzer.h包含 这些修改的原因: 1. 使用字符数而非字节数进行文本长度验证更为恰当 2. NGram分析器能为常含识别错误的OCR文本提供更好的搜索结果 3. 使用内置分析器移除了外部依赖,提升了可维护性 Log: 提升了OCR文本搜索结果的准确性 Influence: 1. 使用各种Unicode字符测试搜索并检查验证行为 2. 验证部分识别OCR文本的搜索结果质量 3. 测试短查询和长查询的结果 4. 检查新分析器对性能的影响 --- .../dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp | 2 +- .../ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp index 476bd8ce..fecc4e76 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp @@ -44,7 +44,7 @@ SearchError OcrTextSearchEngine::validateSearchConditions() OcrTextOptionsAPI optAPI(m_options); if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength + && m_currentQuery.keyword().size() < Global::kMinContentSearchKeywordLength && optAPI.filenameKeyword().isEmpty()) { return SearchError(OcrTextSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index 705c0063..575a3a30 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -17,8 +17,8 @@ #include #include #include +#include -#include 
"3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" #include "utils/lucenequeryutils.h" @@ -484,8 +484,8 @@ void OcrTextIndexedStrategy::performOcrTextSearch(const SearchQuery &query) // Create searcher IndexSearcherPtr searcher = newLucene(reader); - // Create analyzer (reuse ChineseAnalyzer for OCR text) - AnalyzerPtr analyzer = newLucene(); + // Create analyzer (use NGram for OCR text fuzzy matching) + AnalyzerPtr analyzer = newLucene(2, 2); // Build query m_currentQuery = buildLuceneQuery(query, analyzer); From 4501c5520e7c68f4d10d8babfffc667c5dfc96bd Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 22 May 2026 17:13:31 +0800 Subject: [PATCH 25/36] perf: optimize search performance with field selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added field selector to only load necessary fields during search operations to reduce disk I/O. 1. Implemented MapFieldSelector to selectively load document fields 2. Contents field is now only loaded when full text retrieval is enabled 3. Detailed metadata fields are conditionally loaded based on detailedResults option 4. Optimized path field loading by making it always included Significantly reduces memory usage and disk access when: 1. Only searching paths without content previews 2. Showing basic results without detailed metadata 3. Operating on large indexes with many documents Influence: 1. Test search performance with detailed results enabled/disabled 2. Verify content preview still works when enabled 3. Check basic path-only searches are faster 4. Verify all detailed metadata fields appear when requested 5. Test with large document collections to confirm reduced memory usage perf: 使用字段选择器优化搜索性能 通过字段选择器只加载搜索所需的字段,减少磁盘I/O操作 1. 实现MapFieldSelector来选择性加载文档字段 2. 内容字段现在只在需要全文检索时加载 3. 详细元数据字段根据detailedResults选项条件性加载 4. 通过始终加载路径字段进行优化 在以下场景显著减少内存使用和磁盘访问: 1. 仅搜索路径不预览内容时 2. 显示基础结果不需要详细元数据时 3. 
处理包含大量文档的索引时 Influence: 1. 测试启用/禁用详细结果时的搜索性能 2. 验证内容预览在启用时仍能正常工作 3. 检查仅搜索路径的基本查询是否更快 4. 确保请求时所有详细元数据字段都能显示 5. 使用大型文档集合测试确认内存使用减少 --- .../contentstrategies/indexedstrategy.cpp | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index 1d6c088f..2fcf7529 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -325,6 +326,24 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr bool enableHTML = optAPI.isSearchResultHighlightEnabled(); int previewLen = optAPI.maxPreviewLength() > 0 ? optAPI.maxPreviewLength() : 50; bool enableRetrieval = optAPI.isFullTextRetrievalEnabled(); + bool detailedResults = m_options.detailedResultsEnabled(); + + // Build field selector to avoid loading the large 'contents' field when not needed. + // The contents field stores full document text and loading it for every result + // (even when only path is needed) causes significant disk I/O overhead. 
+ Lucene::Collection fieldsToLoad = Lucene::Collection::newInstance(); + if (enableRetrieval) { + fieldsToLoad.add(LuceneFieldNames::Content::kContents); + } + fieldsToLoad.add(LuceneFieldNames::Content::kPath); + if (Q_UNLIKELY(detailedResults)) { + fieldsToLoad.add(LuceneFieldNames::Content::kFilename); + fieldsToLoad.add(LuceneFieldNames::Content::kIsHidden); + fieldsToLoad.add(LuceneFieldNames::Content::kModifyTime); + fieldsToLoad.add(LuceneFieldNames::Content::kBirthTime); + fieldsToLoad.add(LuceneFieldNames::Content::kFileSize); + } + Lucene::FieldSelectorPtr fieldSelector = newLucene(fieldsToLoad); // Pre-allocate to avoid reallocation during append m_results.reserve(m_results.size() + static_cast(docsSize)); @@ -344,7 +363,7 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr Lucene::DocumentPtr doc; try { - doc = searcher->doc(scoreDoc->doc); + doc = searcher->doc(scoreDoc->doc, fieldSelector); if (!doc) { qWarning() << "Failed to retrieve document at index:" << scoreDoc->doc; continue; @@ -383,7 +402,7 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } // 设置详细结果(如果启用) - if (Q_UNLIKELY(m_options.detailedResultsEnabled())) { + if (Q_UNLIKELY(detailedResults)) { // 文件名 Lucene::String filenameField = doc->get(LuceneFieldNames::Content::kFilename); if (!filenameField.empty()) { From 4bc956543e16325bc6832bf11c92414dc35ae96f Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 22 May 2026 17:23:00 +0800 Subject: [PATCH 26/36] refactor: disable unit tests in release builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Modified CMake configuration to automatically disable unit tests in Release and MinSizeRel builds 2. Moved BUILD_UNIT_TESTS option definition to main CMakeLists.txt for better visibility 3. Added safety guard in autotests/CMakeLists.txt to prevent accidental inclusion when tests are disabled 4. 
Improved build system organization by centralizing test configuration logic Influence: 1. Verify unit tests are excluded from Release/MinSizeRel builds 2. Check Debug/RelWithDebInfo builds still include tests 3. Confirm builds complete successfully in all configurations 4. Test that build system behaves correctly when autotests directory is accessed directly refactor: 在发布版本中禁用单元测试 1. 修改 CMake 配置,在 Release 和 MinSizeRel 构建中自动禁用单元测试 2. 将 BUILD_UNIT_TESTS 选项定义移至主 CMakeLists.txt 以提高可见性 3. 在 autotests/CMakeLists.txt 中添加安全防护,防止测试被禁用时意外包含 4. 通过集中测试配置逻辑改进了构建系统组织 Influence: 1. 验证单元测试是否在 Release/MinSizeRel 构建中被排除 2. 检查 Debug/RelWithDebInfo 构建是否仍包含测试 3. 确认所有配置下的构建都能顺利完成 4. 测试当直接访问 autotests 目录时构建系统的行为是否正确 --- CMakeLists.txt | 13 ++++++++++--- autotests/CMakeLists.txt | 5 ++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ef64ce3..a4d26ddc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,9 +43,16 @@ include(GNUInstallDirs) add_subdirectory(${PROJECT_SOURCE_DIR}/src) -# Unit tests (requires Qt Test, enabled by default) -# option(BUILD_UNIT_TESTS is defined in autotests/CMakeLists.txt) -add_subdirectory(${PROJECT_SOURCE_DIR}/autotests) +# Unit tests (enabled by default, except in Release/MinSizeRel builds) +if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + option(BUILD_UNIT_TESTS "Build unit tests" OFF) +else() + option(BUILD_UNIT_TESTS "Build unit tests" ON) +endif() + +if(BUILD_UNIT_TESTS) + add_subdirectory(${PROJECT_SOURCE_DIR}/autotests) +endif() # Legacy tests (temporarily disabled) # add_subdirectory(${PROJECT_SOURCE_DIR}/tests) diff --git a/autotests/CMakeLists.txt b/autotests/CMakeLists.txt index ece3b9bd..d1d46e9f 100644 --- a/autotests/CMakeLists.txt +++ b/autotests/CMakeLists.txt @@ -3,9 +3,8 @@ cmake_minimum_required(VERSION 3.10) project(autotests) -# Enable/disable building of unit tests (default: ON) -option(BUILD_UNIT_TESTS "Build unit tests" ON) - +# 
BUILD_UNIT_TESTS is defined in top-level CMakeLists.txt; +# this guard acts as a safety net if autotests is somehow included directly if(NOT BUILD_UNIT_TESTS) return() endif() From 6b3cb74c4e5bec431168cef58450515e863770dd Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Sat, 23 May 2026 20:32:49 +0800 Subject: [PATCH 27/36] feat: optimize ngram search query building MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The changes refactor how ngram search queries are built to improve performance and simplify the query building process: 1. Added new buildNGramSearchQuery utility function that directly constructs TermQuery and PhraseQuery for ngram searching 2. Removed QueryParser and NGramAnalyzer dependencies from content and OCR search strategies 3. Unified query building logic between content and OCR search 4. Modified keyword length validation to use UTF-8 byte count instead of character count These changes eliminate the need for real-time analysis during searches and provide more precise control over query generation. The implementation specifically handles: - Single and two-character terms as TermQuery - Longer terms as PhraseQuery with proper positions - Case sensitivity handling Log: Optimized ngram search query building for better performance Influence: 1. Test content search with various keyword lengths 2. Verify special character handling in search terms 3. Check case sensitivity behavior 4. Verify mixed filename and content search results 5. Test OCR text search functionality feat: 优化 ngram 搜索查询构建 这些变更重构了 ngram 搜索查询的构建方式以提高性能并简化流程: 1. 新增 buildNGramSearchQuery 工具函数直接构建TermQuery和PhraseQuery 2. 从内容和OCR搜索策略中移除QueryParser和NGramAnalyzer依赖 3. 统一了内容和OCR搜索的查询构建逻辑 4. 修改关键词长度验证改用UTF-8字节计数而非字符计数 这些变更消除了实时分析的需求并提供更精确的查询控制。具体实现了: - 单字符和两字符关键词使用TermQuery - 更长关键词使用带位置的PhraseQuery - 大小写敏感处理 Log: 优化ngram搜索查询构建提升性能 Influence: 1. 测试不同长度关键词的内容搜索 2. 验证搜索词中的特殊字符处理 3. 检查大小写敏感行为 4. 验证混合文件名和内容搜索结果 5. 
测试OCR文本搜索功能 --- .../dfm-search-tests/tst_search_utils.cpp | 42 +++++++++++++ .../contentsearch/contentsearchengine.cpp | 2 +- .../contentstrategies/indexedstrategy.cpp | 61 +++++++------------ .../contentstrategies/indexedstrategy.h | 16 ++--- .../ocrtextsearch/ocrtextsearchengine.cpp | 2 +- .../ocrtextstrategies/indexedstrategy.cpp | 61 +++++++------------ .../ocrtextstrategies/indexedstrategy.h | 16 ++--- .../dfm-search-lib/utils/lucenequeryutils.cpp | 47 ++++++++++++++ .../dfm-search-lib/utils/lucenequeryutils.h | 15 +++++ 9 files changed, 160 insertions(+), 102 deletions(-) diff --git a/autotests/dfm-search-tests/tst_search_utils.cpp b/autotests/dfm-search-tests/tst_search_utils.cpp index 11b319e9..44a1ac01 100644 --- a/autotests/dfm-search-tests/tst_search_utils.cpp +++ b/autotests/dfm-search-tests/tst_search_utils.cpp @@ -9,8 +9,13 @@ #include #include +#include +#include +#include + #include #include +#include using namespace DFMSEARCH; @@ -26,6 +31,7 @@ private Q_SLOTS: void testPinyinAcronym(); void testAnythingStatus(); void testFileNameBlacklistMatcher(); + void testNGramSearchQuery(); private: void doTestPinyin(const QString &caseName, const QString &input, bool expected); @@ -331,6 +337,42 @@ void tst_SearchUtils::testGlobal() Q_UNUSED(blacklistPaths); } +void tst_SearchUtils::testNGramSearchQuery() +{ + Lucene::QueryPtr oneCharQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "A"); + Lucene::TermQueryPtr oneCharTermQuery = boost::dynamic_pointer_cast(oneCharQuery); + QVERIFY(oneCharTermQuery); + QCOMPARE(oneCharTermQuery->getTerm()->field(), Lucene::String(L"contents")); + QCOMPARE(oneCharTermQuery->getTerm()->text(), Lucene::String(L"a")); + + Lucene::QueryPtr twoCharQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "Ab"); + Lucene::TermQueryPtr twoCharTermQuery = boost::dynamic_pointer_cast(twoCharQuery); + QVERIFY(twoCharTermQuery); + QCOMPARE(twoCharTermQuery->getTerm()->text(), Lucene::String(L"ab")); + + Lucene::QueryPtr 
evenQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "abcdef"); + Lucene::PhraseQueryPtr evenPhraseQuery = boost::dynamic_pointer_cast(evenQuery); + QVERIFY(evenPhraseQuery); + QCOMPARE(evenPhraseQuery->getTerms().size(), 3); + QCOMPARE(evenPhraseQuery->getTerms()[0]->text(), Lucene::String(L"ab")); + QCOMPARE(evenPhraseQuery->getTerms()[1]->text(), Lucene::String(L"cd")); + QCOMPARE(evenPhraseQuery->getTerms()[2]->text(), Lucene::String(L"ef")); + QCOMPARE(evenPhraseQuery->getPositions()[0], 0); + QCOMPARE(evenPhraseQuery->getPositions()[1], 2); + QCOMPARE(evenPhraseQuery->getPositions()[2], 4); + + Lucene::QueryPtr oddQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "abcde"); + Lucene::PhraseQueryPtr oddPhraseQuery = boost::dynamic_pointer_cast(oddQuery); + QVERIFY(oddPhraseQuery); + QCOMPARE(oddPhraseQuery->getTerms().size(), 3); + QCOMPARE(oddPhraseQuery->getTerms()[0]->text(), Lucene::String(L"ab")); + QCOMPARE(oddPhraseQuery->getTerms()[1]->text(), Lucene::String(L"cd")); + QCOMPARE(oddPhraseQuery->getTerms()[2]->text(), Lucene::String(L"de")); + QCOMPARE(oddPhraseQuery->getPositions()[0], 0); + QCOMPARE(oddPhraseQuery->getPositions()[1], 2); + QCOMPARE(oddPhraseQuery->getPositions()[2], 3); +} + QObject *create_tst_SearchUtils() { return new tst_SearchUtils(); diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp index d992598e..bca9df31 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp @@ -42,7 +42,7 @@ SearchError ContentSearchEngine::validateSearchConditions() } if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().size() < Global::kMinContentSearchKeywordLength + && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength && api.filenameKeyword().isEmpty()) { return 
SearchError(ContentSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index 2fcf7529..eab4797e 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -19,7 +18,6 @@ #include #include -#include #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" @@ -64,21 +62,16 @@ void ContentIndexedStrategy::search(const SearchQuery &query) } } -Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer) +Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &query) { try { m_keywords.clear(); ContentOptionsAPI optAPI(m_options); // Use the member m_options bool mixedAndEnabled = optAPI.isFilenameContentMixedAndSearchEnabled(); - Lucene::QueryParserPtr contentsParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::Content::kContents, - analyzer); - Lucene::QueryPtr mainQuery; if (query.type() == SearchQuery::Type::Simple) { - mainQuery = buildSimpleContentsQuery(query, contentsParser); + mainQuery = buildSimpleContentsQuery(query); } else if (query.type() == SearchQuery::Type::Boolean) { if (query.subQueries().isEmpty()) { // For an empty boolean query, match nothing. 
@@ -87,12 +80,12 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Determine which logic path to take for boolean queries if (mixedAndEnabled && query.booleanOperator() == SearchQuery::BooleanOperator::AND) { // New "advanced" AND logic for contents/filename - mainQuery = buildAdvancedAndQuery(query, contentsParser, analyzer); + mainQuery = buildAdvancedAndQuery(query); } else { // "Standard" contents-only logic for: // 1. OR queries (regardless of mixedAndEnabled value). // 2. AND queries when mixedAndEnabled is false. - mainQuery = buildStandardBooleanContentsQuery(query, contentsParser); + mainQuery = buildStandardBooleanContentsQuery(query); } } } else { @@ -103,12 +96,9 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Add filename keyword query (before filters, so it replaces empty content query correctly) QString filenameKw = optAPI.filenameKeyword(); if (!filenameKw.isEmpty()) { - Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::Content::kFilename, - analyzer); - Lucene::QueryPtr filenameQuery = filenameParser->parse( - LuceneQueryUtils::processQueryString(filenameKw, false)); + Lucene::QueryPtr filenameQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kFilename), + filenameKw); if (filenameQuery) { // Check if content keywords are effectively empty @@ -228,15 +218,9 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } -QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, const Lucene::QueryParserPtr &contentsParser, const Lucene::AnalyzerPtr &analyzer) +QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query) { // This method implements the new "mixed" AND logic. - // It requires its own filenameParser. 
- Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::Content::kFilename, - analyzer); - Lucene::BooleanQueryPtr overallQuery = newLucene(); Lucene::BooleanQueryPtr mainAndClausesQuery = newLucene(); Lucene::BooleanQueryPtr allContentsQuery = newLucene(); @@ -250,10 +234,12 @@ QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, } hasValidKeywords = true; - // 使用 LuceneQueryUtils 处理特殊字符 - Lucene::String processedKeyword = LuceneQueryUtils::processQueryString(subQuery.keyword(), false); - Lucene::QueryPtr contentsTermQuery = contentsParser->parse(processedKeyword); - Lucene::QueryPtr filenameTermQuery = filenameParser->parse(processedKeyword); + Lucene::QueryPtr contentsTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kContents), + subQuery.keyword()); + Lucene::QueryPtr filenameTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kFilename), + subQuery.keyword()); // Build (contents:keyword OR filename:keyword) Lucene::BooleanQueryPtr combinedTermQuery = newLucene(); @@ -283,7 +269,7 @@ QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, return overallQuery; } -QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &contentsParser) +QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQuery &query) { // This method implements the "original" boolean logic, searching only "contents". 
Lucene::BooleanQueryPtr booleanQuery = newLucene(); @@ -294,8 +280,9 @@ QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQ continue; // Skip empty keywords } - // 使用 LuceneQueryUtils 处理特殊字符 - Lucene::QueryPtr termQuery = contentsParser->parse(LuceneQueryUtils::processQueryString(subQuery.keyword(), false)); + Lucene::QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kContents), + subQuery.keyword()); booleanQuery->add(termQuery, query.booleanOperator() == SearchQuery::BooleanOperator::AND ? Lucene::BooleanClause::MUST : Lucene::BooleanClause::SHOULD); } @@ -303,14 +290,15 @@ QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQ return booleanQuery; } -QueryPtr ContentIndexedStrategy::buildSimpleContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &contentsParser) +QueryPtr ContentIndexedStrategy::buildSimpleContentsQuery(const SearchQuery &query) { m_keywords.append(query.keyword()); if (query.keyword().isEmpty()) { return newLucene(); // Match nothing for empty keyword } - // 使用 LuceneQueryUtils 处理特殊字符 - return contentsParser->parse(LuceneQueryUtils::processQueryString(query.keyword(), false)); + return LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kContents), + query.keyword()); } void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr &searcher, @@ -495,11 +483,8 @@ void ContentIndexedStrategy::performContentSearch(const SearchQuery &query) // 创建搜索器 IndexSearcherPtr searcher = newLucene(reader); - // 创建分析器 - AnalyzerPtr analyzer = newLucene(2, 2); - // 构建查询 - m_currentQuery = buildLuceneQuery(query, analyzer); + m_currentQuery = buildLuceneQuery(query); if (!m_currentQuery) { qWarning() << "Failed to build Lucene query"; emit errorOccurred(SearchError(ContentSearchErrorCode::ContentIndexException)); diff --git 
a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h index 39028af6..95e56c76 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h @@ -7,7 +7,6 @@ #include "basestrategy.h" #include -#include #include #include #include @@ -39,22 +38,15 @@ class ContentIndexedStrategy : public ContentBaseStrategy void performContentSearch(const SearchQuery &query); // Build Lucene query - Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer); + Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query); // Helper for simple queries (original logic for "contents" field) - Lucene::QueryPtr buildSimpleContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &contentsParser); + Lucene::QueryPtr buildSimpleContentsQuery(const SearchQuery &query); // Helper for "standard" boolean logic (original logic for "contents" field, handles AND/OR) - Lucene::QueryPtr buildStandardBooleanContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &contentsParser); + Lucene::QueryPtr buildStandardBooleanContentsQuery(const SearchQuery &query); // Helper for "advanced" mixed AND logic (searches "contents" and "filename") - Lucene::QueryPtr buildAdvancedAndQuery( - const SearchQuery &query, // Operator is implicitly AND - const Lucene::QueryParserPtr &contentsParser, - const Lucene::AnalyzerPtr &analyzer); // Analyzer is needed to create filenameParser + Lucene::QueryPtr buildAdvancedAndQuery(const SearchQuery &query); // Operator is implicitly AND // Process search results void processSearchResults(const Lucene::IndexSearcherPtr &searcher, diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp index 
fecc4e76..476bd8ce 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp @@ -44,7 +44,7 @@ SearchError OcrTextSearchEngine::validateSearchConditions() OcrTextOptionsAPI optAPI(m_options); if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().size() < Global::kMinContentSearchKeywordLength + && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength && optAPI.filenameKeyword().isEmpty()) { return SearchError(OcrTextSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index 575a3a30..fd821037 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -17,7 +16,6 @@ #include #include #include -#include #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" @@ -62,21 +60,16 @@ void OcrTextIndexedStrategy::search(const SearchQuery &query) } } -Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer) +Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &query) { try { m_keywords.clear(); OcrTextOptionsAPI optAPI(m_options); bool mixedAndEnabled = optAPI.isFilenameOcrContentMixedAndSearchEnabled(); - Lucene::QueryParserPtr ocrContentsParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::OcrText::kOcrContents, - analyzer); - Lucene::QueryPtr mainQuery; if (query.type() == SearchQuery::Type::Simple) { - mainQuery = buildSimpleOcrContentsQuery(query, ocrContentsParser); + mainQuery = buildSimpleOcrContentsQuery(query); } else if 
(query.type() == SearchQuery::Type::Boolean) { if (query.subQueries().isEmpty()) { // For an empty boolean query, match nothing. @@ -85,10 +78,10 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Determine which logic path to take for boolean queries if (mixedAndEnabled && query.booleanOperator() == SearchQuery::BooleanOperator::AND) { // New "advanced" AND logic for ocr_contents/filename - mainQuery = buildAdvancedAndQuery(query, ocrContentsParser, analyzer); + mainQuery = buildAdvancedAndQuery(query); } else { // "Standard" ocr_contents-only logic - mainQuery = buildStandardBooleanOcrContentsQuery(query, ocrContentsParser); + mainQuery = buildStandardBooleanOcrContentsQuery(query); } } } else { @@ -99,12 +92,9 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Add filename keyword query (before filters, so it replaces empty content query correctly) QString filenameKw = optAPI.filenameKeyword(); if (!filenameKw.isEmpty()) { - Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::OcrText::kFilename, - analyzer); - Lucene::QueryPtr filenameQuery = filenameParser->parse( - LuceneQueryUtils::processQueryString(filenameKw, false)); + Lucene::QueryPtr filenameQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kFilename), + filenameKw); if (filenameQuery) { // Check if content keywords are effectively empty @@ -224,15 +214,9 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } -QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, const Lucene::QueryParserPtr &ocrContentsParser, const Lucene::AnalyzerPtr &analyzer) +QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query) { // This method implements the "mixed" AND logic similar to content search. - // It requires its own filenameParser. 
- Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::OcrText::kFilename, - analyzer); - Lucene::BooleanQueryPtr overallQuery = newLucene(); Lucene::BooleanQueryPtr mainAndClausesQuery = newLucene(); Lucene::BooleanQueryPtr allOcrContentsQuery = newLucene(); @@ -246,10 +230,12 @@ QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, } hasValidKeywords = true; - // Use LuceneQueryUtils to process special characters - Lucene::String processedKeyword = LuceneQueryUtils::processQueryString(subQuery.keyword(), false); - Lucene::QueryPtr ocrContentsTermQuery = ocrContentsParser->parse(processedKeyword); - Lucene::QueryPtr filenameTermQuery = filenameParser->parse(processedKeyword); + Lucene::QueryPtr ocrContentsTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kOcrContents), + subQuery.keyword()); + Lucene::QueryPtr filenameTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kFilename), + subQuery.keyword()); // Build (ocr_contents:keyword OR filename:keyword) Lucene::BooleanQueryPtr combinedTermQuery = newLucene(); @@ -278,7 +264,7 @@ QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, return overallQuery; } -QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &ocrContentsParser) +QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const SearchQuery &query) { // This method implements the "original" boolean logic, searching only "ocr_contents". 
Lucene::BooleanQueryPtr booleanQuery = newLucene(); @@ -289,8 +275,9 @@ QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const Sear continue; // Skip empty keywords } - // Use LuceneQueryUtils to process special characters - Lucene::QueryPtr termQuery = ocrContentsParser->parse(LuceneQueryUtils::processQueryString(subQuery.keyword(), false)); + Lucene::QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kOcrContents), + subQuery.keyword()); booleanQuery->add(termQuery, query.booleanOperator() == SearchQuery::BooleanOperator::AND ? Lucene::BooleanClause::MUST : Lucene::BooleanClause::SHOULD); } @@ -298,14 +285,15 @@ QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const Sear return booleanQuery; } -QueryPtr OcrTextIndexedStrategy::buildSimpleOcrContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &ocrContentsParser) +QueryPtr OcrTextIndexedStrategy::buildSimpleOcrContentsQuery(const SearchQuery &query) { m_keywords.append(query.keyword()); if (query.keyword().isEmpty()) { return newLucene(); // Match nothing for empty keyword } - // Use LuceneQueryUtils to process special characters - return ocrContentsParser->parse(LuceneQueryUtils::processQueryString(query.keyword(), false)); + return LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kOcrContents), + query.keyword()); } void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr &searcher, @@ -484,11 +472,8 @@ void OcrTextIndexedStrategy::performOcrTextSearch(const SearchQuery &query) // Create searcher IndexSearcherPtr searcher = newLucene(reader); - // Create analyzer (use NGram for OCR text fuzzy matching) - AnalyzerPtr analyzer = newLucene(2, 2); - // Build query - m_currentQuery = buildLuceneQuery(query, analyzer); + m_currentQuery = buildLuceneQuery(query); if (!m_currentQuery) { qWarning() << "Failed to build Lucene query 
for OCR text search"; emit errorOccurred(SearchError(OcrTextSearchErrorCode::OcrTextIndexException)); diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h index d1d908a4..d75ef469 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h @@ -7,7 +7,6 @@ #include "basestrategy.h" #include -#include #include #include #include @@ -40,23 +39,16 @@ class OcrTextIndexedStrategy : public OcrTextBaseStrategy void performOcrTextSearch(const SearchQuery &query); // Build Lucene query - Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer); + Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query); // Helper for simple queries - Lucene::QueryPtr buildSimpleOcrContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &ocrContentsParser); + Lucene::QueryPtr buildSimpleOcrContentsQuery(const SearchQuery &query); // Helper for "standard" boolean logic - Lucene::QueryPtr buildStandardBooleanOcrContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &ocrContentsParser); + Lucene::QueryPtr buildStandardBooleanOcrContentsQuery(const SearchQuery &query); // Helper for "advanced" mixed AND logic (searches "ocr_contents" and "filename") - Lucene::QueryPtr buildAdvancedAndQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &ocrContentsParser, - const Lucene::AnalyzerPtr &analyzer); + Lucene::QueryPtr buildAdvancedAndQuery(const SearchQuery &query); // Process search results void processSearchResults(const Lucene::IndexSearcherPtr &searcher, diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp index 603fad88..14832eb3 100644 --- 
a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp @@ -6,10 +6,33 @@ #include #include +#include +#include DFM_SEARCH_BEGIN_NS namespace LuceneQueryUtils { +namespace { + +Lucene::String toLuceneString(const QString &str, bool caseSensitive) +{ + QString normalized = caseSensitive ? str : str.toLower(); + QByteArray utf8Bytes = normalized.toUtf8(); + Lucene::String luceneStr = Lucene::StringUtils::toUnicode(std::string(utf8Bytes.constData(), utf8Bytes.length())); + if (luceneStr.empty()) { + luceneStr = Lucene::StringUtils::toUnicode(normalized.toStdString()); + } + return luceneStr; +} + +Lucene::TermPtr buildTerm(const QString &fieldName, const QString &text, bool caseSensitive) +{ + return Lucene::newLucene( + toLuceneString(fieldName, true), + toLuceneString(text, caseSensitive)); +} + +} // namespace std::wstring getLuceneSpecialChars() { @@ -52,6 +75,30 @@ Lucene::String processQueryString(const QString &str, bool caseSensitive) return luceneStr; } +Lucene::QueryPtr buildNGramSearchQuery(const QString &fieldName, const QString &keyword, bool caseSensitive) +{ + if (fieldName.isEmpty() || keyword.isEmpty()) { + return nullptr; + } + + if (keyword.size() <= 2) { + return Lucene::newLucene( + buildTerm(fieldName, keyword, caseSensitive)); + } + + Lucene::PhraseQueryPtr phraseQuery = Lucene::newLucene(); + for (int pos = 0; pos + 2 <= keyword.size(); pos += 2) { + phraseQuery->add(buildTerm(fieldName, keyword.mid(pos, 2), caseSensitive), pos); + } + + if (keyword.size() % 2 != 0) { + const int tailPos = keyword.size() - 2; + phraseQuery->add(buildTerm(fieldName, keyword.mid(tailPos, 2), caseSensitive), tailPos); + } + + return phraseQuery; +} + Lucene::QueryPtr buildPathPrefixQuery(const QString &pathPrefix, const QString &fieldName) { if (pathPrefix.isEmpty() || fieldName.isEmpty()) { diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h 
b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h index c7d1c834..29c96b5a 100644 --- a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h @@ -25,6 +25,21 @@ namespace LuceneQueryUtils { */ Lucene::String processQueryString(const QString &str, bool caseSensitive = false); +/** + * @brief Build a query that matches text indexed by NGramAnalyzer(1, 2) + * + * The query is built directly instead of passing user input through an n-gram + * analyzer at search time. One- and two-character keywords use TermQuery. + * Longer keywords use a sparse PhraseQuery over 2-grams to avoid generating + * every overlapping query term. + * + * @param fieldName The indexed field name + * @param keyword The raw user keyword + * @param caseSensitive Whether the search is case sensitive + * @return Lucene query object, or nullptr if fieldName or keyword is empty + */ +Lucene::QueryPtr buildNGramSearchQuery(const QString &fieldName, const QString &keyword, bool caseSensitive = false); + /** * @brief Get a list of Lucene special characters that need escaping * @return List of special characters From 9c477b97ad7b11a1ac86345a67cc7ce56908948d Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Sun, 24 May 2026 14:56:02 +0800 Subject: [PATCH 28/36] refactor: remove NGram analyzer and tokenizer components MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed the NGram analyzer and tokenizer implementation from the Lucene++ integration. These components were responsible for generating n-gram tokens from input text but are no longer needed in the search functionality. The removal includes: 1. NGramAnalyzer header and implementation 2. NGramTokenizer header and implementation 3. All associated utility functions and constants This cleanup is part of ongoing efforts to simplify the Lucene++ integration and remove unused components.
The n-gram token generation functionality wasn't being utilized in the current search implementation and was complicating the codebase. Influence: Testing should verify that basic search functionality still works as expected, particularly: 1. File content searching 2. File name searching 3. Special character handling in searches 4. Various query types (exact match, wildcard, etc.) refactor: 移除NGram分析器和分词器组件 从Lucene++集成中移除了NGram分析器和分词器实现。这些组件原本负责从输入文 本生成n-gram令牌,但在当前搜索功能中已不再需要。此次移除包括: 1. NGramAnalyzer头文件和实现文件 2. NGramTokenizer头文件和实现文件 3. 所有相关的工具函数和常量 此次清理是简化Lucene++集成和移除未使用组件工作的一部分。n-gram令牌生成功 能在当前搜索实现中并未使用,并且使代码库变得复杂。 Influence: 测试应验证基本搜索功能仍按预期工作,特别是: 1. 文件内容搜索 2. 文件名搜索 3. 搜索中的特殊字符处理 4. 各种查询类型(精确匹配、通配符等) --- .../dfm-search/lucene++/ngramanalyzer.h | 31 ---- .../dfm-search/lucene++/ngramtokenizer.h | 59 ------ .../dfm-search-lib/lucene++/ngramanalyzer.cpp | 37 ---- .../lucene++/ngramtokenizer.cpp | 169 ------------------ 4 files changed, 296 deletions(-) delete mode 100644 include/dfm-search/dfm-search/lucene++/ngramanalyzer.h delete mode 100644 include/dfm-search/dfm-search/lucene++/ngramtokenizer.h delete mode 100644 src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp delete mode 100644 src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp diff --git a/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h b/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h deleted file mode 100644 index 08ca8436..00000000 --- a/include/dfm-search/dfm-search/lucene++/ngramanalyzer.h +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
-// -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NGRAMANALYZER_H -#define NGRAMANALYZER_H - -#include - -namespace Lucene { - -class NGramAnalyzer : public Analyzer -{ -public: - explicit NGramAnalyzer(int32_t minGram, int32_t maxGram); - virtual ~NGramAnalyzer(); - - LUCENE_CLASS(NGramAnalyzer); - -public: - virtual TokenStreamPtr tokenStream(const String &fieldName, const ReaderPtr &reader); - virtual TokenStreamPtr reusableTokenStream(const String &fieldName, const ReaderPtr &reader); - -private: - int32_t m_minGram; - int32_t m_maxGram; -}; - -} // namespace Lucene - -#endif // NGRAMANALYZER_H diff --git a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h b/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h deleted file mode 100644 index 7fecaa11..00000000 --- a/include/dfm-search/dfm-search/lucene++/ngramtokenizer.h +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. -// -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NGRAMTOKENIZER_H -#define NGRAMTOKENIZER_H - -#include - -namespace Lucene { - -class NGramTokenizer : public Tokenizer -{ -public: - NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram); - NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, - int32_t minGram, int32_t maxGram); - - virtual ~NGramTokenizer(); - - LUCENE_CLASS(NGramTokenizer); - -public: - virtual bool incrementToken(); - virtual void end(); - - virtual void reset(); - virtual void reset(const ReaderPtr &input); - -private: - static int32_t normalizeGramSize(int32_t gramSize); - - void init(); - void resetState(); - bool fillBuffer(int32_t need); - - int32_t m_minGram; - int32_t m_maxGram; - - static const int32_t kIoBufferSize = 1024; - - CharArray m_ioBuffer; - int32_t m_ioLen; - int32_t m_bufferIndex; - bool m_inputExhausted; - - int32_t m_offset; // current position in the logical input stream - int32_t m_gramSize; // current n-gram size being emitted - 
CharArray m_termBuffer; - - TermAttributePtr m_termAtt; - OffsetAttributePtr m_offsetAtt; - PositionIncrementAttributePtr m_posIncrAtt; - bool m_isFirstTokenAtPosition; // true = positionIncrement=1, false = 0 (same position) -}; - -} // namespace Lucene - -#endif // NGRAMTOKENIZER_H diff --git a/src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp b/src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp deleted file mode 100644 index e21f9fe3..00000000 --- a/src/dfm-search/dfm-search-lib/lucene++/ngramanalyzer.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. -// -// SPDX-License-Identifier: GPL-3.0-or-later - -#include -#include - -namespace Lucene { - -NGramAnalyzer::NGramAnalyzer(int32_t minGram, int32_t maxGram) - : m_minGram(minGram), m_maxGram(maxGram) -{ -} - -NGramAnalyzer::~NGramAnalyzer() -{ -} - -TokenStreamPtr NGramAnalyzer::tokenStream(const String &fieldName, const ReaderPtr &reader) -{ - return newLucene(reader, m_minGram, m_maxGram); -} - -TokenStreamPtr NGramAnalyzer::reusableTokenStream(const String &fieldName, const ReaderPtr &reader) -{ - LuceneObjectPtr prev = getPreviousTokenStream(); - TokenizerPtr saved(boost::dynamic_pointer_cast(prev)); - if (!saved) { - saved = newLucene(reader, m_minGram, m_maxGram); - setPreviousTokenStream(saved); - } else { - saved->reset(reader); - } - return saved; -} - -} // namespace Lucene diff --git a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp b/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp deleted file mode 100644 index 73336260..00000000 --- a/src/dfm-search/dfm-search-lib/lucene++/ngramtokenizer.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
-// -// SPDX-License-Identifier: GPL-3.0-or-later - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace Lucene { - -const int32_t NGramTokenizer::kIoBufferSize; - -NGramTokenizer::NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram) - : Tokenizer(input), - m_minGram(std::min(normalizeGramSize(minGram), normalizeGramSize(maxGram))), - m_maxGram(std::max(normalizeGramSize(minGram), normalizeGramSize(maxGram))), - m_isFirstTokenAtPosition(true) -{ - init(); -} - -NGramTokenizer::NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, - int32_t minGram, int32_t maxGram) - : Tokenizer(factory, input), - m_minGram(std::min(normalizeGramSize(minGram), normalizeGramSize(maxGram))), - m_maxGram(std::max(normalizeGramSize(minGram), normalizeGramSize(maxGram))), - m_isFirstTokenAtPosition(true) -{ - init(); -} - -NGramTokenizer::~NGramTokenizer() -{ -} - -int32_t NGramTokenizer::normalizeGramSize(int32_t gramSize) -{ - return std::min(std::max(gramSize, 1), kIoBufferSize); -} - -void NGramTokenizer::init() -{ - m_ioBuffer = CharArray::newInstance(kIoBufferSize); - std::fill_n(m_ioBuffer.get(), kIoBufferSize, 0); - m_termBuffer = CharArray::newInstance(m_maxGram); - std::fill_n(m_termBuffer.get(), m_maxGram, 0); - - m_termAtt = addAttribute(); - m_offsetAtt = addAttribute(); - m_posIncrAtt = addAttribute(); - - resetState(); -} - -void NGramTokenizer::reset() -{ - Tokenizer::reset(); - resetState(); -} - -void NGramTokenizer::reset(const ReaderPtr &input) -{ - Tokenizer::reset(input); - resetState(); -} - -void NGramTokenizer::resetState() -{ - m_bufferIndex = 0; - m_ioLen = 0; - m_inputExhausted = false; - m_offset = 0; - m_gramSize = m_minGram; - m_isFirstTokenAtPosition = true; -} - -// Ensure at least 'need' chars are available starting from m_bufferIndex -bool NGramTokenizer::fillBuffer(int32_t need) -{ - if (need <= 0) - return true; - - int32_t available = m_ioLen - m_bufferIndex; - if 
(available >= need) - return true; - - if (m_inputExhausted) - return false; - - // Compact: shift unread data to front of buffer - if (m_bufferIndex > 0) { - int32_t remaining = m_ioLen - m_bufferIndex; - if (remaining > 0) - memmove(m_ioBuffer.get(), m_ioBuffer.get() + m_bufferIndex, remaining * sizeof(wchar_t)); - m_ioLen = remaining; - m_bufferIndex = 0; - } - - // Read more from input - while (m_ioLen < kIoBufferSize) { - int32_t read = input->read(m_ioBuffer.get(), m_ioLen, kIoBufferSize - m_ioLen); - if (read == -1) { - m_inputExhausted = true; - break; - } - m_ioLen += read; - if (m_ioLen - m_bufferIndex >= need) - return true; - } - - return (m_ioLen - m_bufferIndex) >= need; -} - -bool NGramTokenizer::incrementToken() -{ - clearAttributes(); - - while (true) { - // Need m_gramSize chars starting from current buffer position - if (!fillBuffer(m_gramSize)) { - // Not enough chars left — advance to next offset, reset gram size - m_bufferIndex++; - m_offset++; - m_gramSize = m_minGram; - m_isFirstTokenAtPosition = true; - if (!fillBuffer(m_minGram)) - return false; - continue; - } - - int32_t start = m_offset; - - // Emit the n-gram at current buffer position with current gram size - for (int32_t i = 0; i < m_gramSize; ++i) { - m_termBuffer[i] = CharFolder::toLower(m_ioBuffer[m_bufferIndex + i]); - } - m_termAtt->setTermBuffer(m_termBuffer.get(), 0, m_gramSize); - m_offsetAtt->setOffset(correctOffset(start), correctOffset(start + m_gramSize)); - m_posIncrAtt->setPositionIncrement(m_isFirstTokenAtPosition ? 
1 : 0); - - // Cycle gram size: 2 → 3 → 4, then advance offset - m_gramSize++; - if (m_gramSize > m_maxGram) { - m_gramSize = m_minGram; - m_bufferIndex++; - m_offset++; - m_isFirstTokenAtPosition = true; - } else { - m_isFirstTokenAtPosition = false; - } - - return true; - } -} - -void NGramTokenizer::end() -{ - int32_t finalOffset = correctOffset(m_offset); - m_offsetAtt->setOffset(finalOffset, finalOffset); -} - -} // namespace Lucene From e789187f5a3d2a0afef59dac3f638df7dd648b8c Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Sun, 24 May 2026 17:22:01 +0800 Subject: [PATCH 29/36] fix: adjust N-gram token position calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The change fixes the position calculation in N-gram search queries to properly align with Lucene++'s NGramTokenizer behavior. The tokenizer emits 1-gram and 2-gram tokens at each character offset, advancing position by 1 for each token emitted. The new position calculation uses the formula 2*i+1 where i is the start offset to accurately reflect token positions. 1. Added helper function phrasePositionForStandardNGram2 to calculate proper positions 2. Updated test cases to expect new position values (1,5,9 for even length, 1,5,7 for odd) 3. Modified query building logic to use new position calculation Influence: 1. Test N-gram search functionality with various input lengths 2. Verify search accuracy with different character combinations 3. Check position-dependent search operations 4. Validate edge cases with very short/long search terms fix: 调整N元语法标记位置计算 本次修改修正了N元语法搜索查询中的位置计算,使之与Lucene++的 NGramTokenizer行为正确对齐。该分词器在每个字符偏移量处发射1元和2元标记, 并为每个发射的标记将位置前进1。新的位置计算使用公式2*i+1(i为起始偏移 量)来准确反映标记位置。 1. 添加辅助函数phrasePositionForStandardNGram2来计算正确位置 2. 更新测试用例以期望新的位置值(偶数长度为1,5,9,奇数长度为1,5,7) 3. 修改查询构建逻辑以使用新的位置计算 Influence: 1. 测试不同输入长度的N元语法搜索功能 2. 验证不同字符组合的搜索准确性 3. 检查依赖位置的搜索操作 4. 
验证非常短/长的搜索词边缘情况 --- autotests/dfm-search-tests/tst_search_utils.cpp | 12 ++++++------ .../dfm-search-lib/utils/lucenequeryutils.cpp | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/autotests/dfm-search-tests/tst_search_utils.cpp b/autotests/dfm-search-tests/tst_search_utils.cpp index 44a1ac01..c962d449 100644 --- a/autotests/dfm-search-tests/tst_search_utils.cpp +++ b/autotests/dfm-search-tests/tst_search_utils.cpp @@ -357,9 +357,9 @@ void tst_SearchUtils::testNGramSearchQuery() QCOMPARE(evenPhraseQuery->getTerms()[0]->text(), Lucene::String(L"ab")); QCOMPARE(evenPhraseQuery->getTerms()[1]->text(), Lucene::String(L"cd")); QCOMPARE(evenPhraseQuery->getTerms()[2]->text(), Lucene::String(L"ef")); - QCOMPARE(evenPhraseQuery->getPositions()[0], 0); - QCOMPARE(evenPhraseQuery->getPositions()[1], 2); - QCOMPARE(evenPhraseQuery->getPositions()[2], 4); + QCOMPARE(evenPhraseQuery->getPositions()[0], 1); + QCOMPARE(evenPhraseQuery->getPositions()[1], 5); + QCOMPARE(evenPhraseQuery->getPositions()[2], 9); Lucene::QueryPtr oddQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "abcde"); Lucene::PhraseQueryPtr oddPhraseQuery = boost::dynamic_pointer_cast(oddQuery); @@ -368,9 +368,9 @@ void tst_SearchUtils::testNGramSearchQuery() QCOMPARE(oddPhraseQuery->getTerms()[0]->text(), Lucene::String(L"ab")); QCOMPARE(oddPhraseQuery->getTerms()[1]->text(), Lucene::String(L"cd")); QCOMPARE(oddPhraseQuery->getTerms()[2]->text(), Lucene::String(L"de")); - QCOMPARE(oddPhraseQuery->getPositions()[0], 0); - QCOMPARE(oddPhraseQuery->getPositions()[1], 2); - QCOMPARE(oddPhraseQuery->getPositions()[2], 3); + QCOMPARE(oddPhraseQuery->getPositions()[0], 1); + QCOMPARE(oddPhraseQuery->getPositions()[1], 5); + QCOMPARE(oddPhraseQuery->getPositions()[2], 7); } QObject *create_tst_SearchUtils() diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp index 14832eb3..c4443b3a 100644 --- 
a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp @@ -32,6 +32,14 @@ Lucene::TermPtr buildTerm(const QString &fieldName, const QString &text, bool ca toLuceneString(text, caseSensitive)); } +int phrasePositionForStandardNGram2(int startOffset) +{ + // Standard lucene++ NGramTokenizer(1,2) emits 1-gram then 2-gram at each + // character offset, and every emitted token advances the phrase position by 1. + // Therefore the 2-gram starting at offset i lands at position 2 * i + 1. + return startOffset * 2 + 1; +} + } // namespace std::wstring getLuceneSpecialChars() @@ -88,12 +96,14 @@ Lucene::QueryPtr buildNGramSearchQuery(const QString &fieldName, const QString & Lucene::PhraseQueryPtr phraseQuery = Lucene::newLucene(); for (int pos = 0; pos + 2 <= keyword.size(); pos += 2) { - phraseQuery->add(buildTerm(fieldName, keyword.mid(pos, 2), caseSensitive), pos); + phraseQuery->add(buildTerm(fieldName, keyword.mid(pos, 2), caseSensitive), + phrasePositionForStandardNGram2(pos)); } if (keyword.size() % 2 != 0) { const int tailPos = keyword.size() - 2; - phraseQuery->add(buildTerm(fieldName, keyword.mid(tailPos, 2), caseSensitive), tailPos); + phraseQuery->add(buildTerm(fieldName, keyword.mid(tailPos, 2), caseSensitive), + phrasePositionForStandardNGram2(tailPos)); } return phraseQuery; From 455fb21f4c5bfa7325e28fadced9c9776bc6b2f7 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Sun, 24 May 2026 20:00:48 +0800 Subject: [PATCH 30/36] feat: enhance ContentRetriever with content fetching capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added fetchContent() and fetchContents() methods to retrieve full stored content from Lucene index 2. Implemented index directory override and caching mechanism 3. Improved thread safety with mutex protection for index reader operations 4. Added unit tests for new functionality 5. 
Refactored existing highlight methods to use common internal APIs 6. Added field_names.h kCheckSum constant for future use The changes enable: - Retrieving full document content for both text and OCR search types - Flexible index directory configuration for testing scenarios - Better performance through index reader caching - Thread-safe operations in concurrent environments - Maintainable code structure with shared internal APIs Log: Added content retrieval to text search capabilities Influence: 1. Test fetchContent() with valid and invalid file paths 2. Verify batch content retrieval with fetchContents() 3. Test index directory override functionality 4. Verify thread safety in concurrent access scenarios 5. Check performance with large content sets 6. Verify backward compatibility with existing highlight methods feat: 增强ContentRetriever的内容获取能力 1. 新增fetchContent()和fetchContents()方法用于从Lucene索引获取完整存储 内容 2. 实现索引目录覆盖和缓存机制 3. 通过互斥锁保护提升线程安全性 4. 添加新功能的单元测试 5. 重构现有高亮方法使用通用内部API 6. 在field_names.h中添加kCheckSum常量供未来使用 变更内容包括: - 支持获取文本和OCR搜索类型的完整文档内容 - 灵活的索引目录配置支持测试场景 - 通过索引读取器缓存提升性能 - 并发环境中的线程安全操作 - 使用共享内部API的更可维护代码结构 Log: 新增文本搜索内容获取功能 Influence: 1. 测试fetchContent()方法的有效和无效文件路径 2. 验证fetchContents()批量内容获取功能 3. 测试索引目录覆盖功能 4. 验证并发访问场景下的线程安全性 5. 检查大数据集下的性能表现 6. 
验证与现有高亮方法的向后兼容性 --- autotests/dfm-search-tests/main.cpp | 5 + .../tst_content_retriever.cpp | 215 +++++++++++++ .../dfm-search/dfm-search/contentretriever.h | 38 +++ include/dfm-search/dfm-search/field_names.h | 1 + .../dfm-search-lib/utils/contentretriever.cpp | 302 +++++++++++++----- 5 files changed, 484 insertions(+), 77 deletions(-) create mode 100644 autotests/dfm-search-tests/tst_content_retriever.cpp diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index 202a2ea2..c6e7d191 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -19,6 +19,7 @@ extern QObject *create_tst_SizeRangeFilter(); extern QObject *create_tst_IsSemanticQuery(); extern QObject *create_tst_SearchTarget(); extern QObject *create_tst_SemanticQueryBuilderTarget(); +extern QObject *create_tst_ContentRetriever(); int main(int argc, char *argv[]) { @@ -81,5 +82,9 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj14, argc, argv); delete testObj14; + QObject *testObj15 = create_tst_ContentRetriever(); + result |= QTest::qExec(testObj15, argc, argv); + delete testObj15; + return result; } diff --git a/autotests/dfm-search-tests/tst_content_retriever.cpp b/autotests/dfm-search-tests/tst_content_retriever.cpp new file mode 100644 index 00000000..22a50199 --- /dev/null +++ b/autotests/dfm-search-tests/tst_content_retriever.cpp @@ -0,0 +1,215 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace dfmsearch; +using namespace Lucene; + +namespace { + +void addStoredDocument(const IndexWriterPtr &writer, + SearchType type, + const QString &path, + const QString &filename, + const QString &content) +{ + DocumentPtr doc = newLucene(); + if (type == SearchType::Ocr) { + doc->add(newLucene(LuceneFieldNames::OcrText::kPath, path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kFilename, filename.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kOcrContents, content.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + } else { + doc->add(newLucene(LuceneFieldNames::Content::kPath, path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kFilename, filename.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kContents, content.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + } + writer->addDocument(doc); +} + +void createIndex(const QString &indexDir, SearchType type) +{ + QDir().mkpath(indexDir); + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + newLucene(), + true, + IndexWriter::MaxFieldLengthLIMITED); + + if (type == SearchType::Content) { + addStoredDocument(writer, type, + "/tmp/doc-a.txt", + "doc-a.txt", + "hello world from content index"); + addStoredDocument(writer, type, + "/tmp/doc-b.txt", + "doc-b.txt", + "meeting notes and budget data"); + } else { + addStoredDocument(writer, type, + "/tmp/img-a.png", + "img-a.png", + "screenshot text from OCR"); + } + + writer->close(); +} + +} // namespace + +class tst_ContentRetriever : public 
QObject +{ + Q_OBJECT + +private Q_SLOTS: + void fetchContent_single(); + void fetchContent_batch(); + void fetchHighlight_usesTemporaryIndex(); + void concurrentFetch_sharedRetriever(); +}; + +void tst_ContentRetriever::fetchContent_single() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + const QString ocrIndexDir = tempDir.path() + "/ocr-index"; + createIndex(contentIndexDir, SearchType::Content); + createIndex(ocrIndexDir, SearchType::Ocr); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + retriever.setIndexDirectory(SearchType::Ocr, ocrIndexDir); + + QCOMPARE(retriever.fetchContent("/tmp/doc-a.txt", SearchType::Content), + QString("hello world from content index")); + QCOMPARE(retriever.fetchContent("/tmp/img-a.png", SearchType::Ocr), + QString("screenshot text from OCR")); + QVERIFY(retriever.fetchContent("/tmp/missing.txt", SearchType::Content).isEmpty()); +} + +void tst_ContentRetriever::fetchContent_batch() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + createIndex(contentIndexDir, SearchType::Content); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + const QMap results = retriever.fetchContents( + { "/tmp/doc-a.txt", "/tmp/doc-b.txt", "/tmp/missing.txt" }, + SearchType::Content); + + QCOMPARE(results.value("/tmp/doc-a.txt"), QString("hello world from content index")); + QCOMPARE(results.value("/tmp/doc-b.txt"), QString("meeting notes and budget data")); + QVERIFY(results.contains("/tmp/missing.txt")); + QVERIFY(results.value("/tmp/missing.txt").isEmpty()); +} + +void tst_ContentRetriever::fetchHighlight_usesTemporaryIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + createIndex(contentIndexDir, 
SearchType::Content); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + HighlightOptions options; + options.maxPreviewLength = 80; + + const QString snippet = retriever.fetchHighlight("/tmp/doc-b.txt", + "budget", + SearchType::Content, + options); + QVERIFY(snippet.contains("budget", Qt::CaseInsensitive)); +} + +void tst_ContentRetriever::concurrentFetch_sharedRetriever() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + const QString ocrIndexDir = tempDir.path() + "/ocr-index"; + createIndex(contentIndexDir, SearchType::Content); + createIndex(ocrIndexDir, SearchType::Ocr); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + retriever.setIndexDirectory(SearchType::Ocr, ocrIndexDir); + + HighlightOptions options; + options.maxPreviewLength = 80; + + std::atomic_bool failed { false }; + std::vector> tasks; + tasks.reserve(8); + + for (int worker = 0; worker < 8; ++worker) { + tasks.emplace_back(std::async(std::launch::async, [&retriever, &options, &failed]() { + for (int i = 0; i < 50; ++i) { + if (retriever.fetchContent("/tmp/doc-a.txt", SearchType::Content) + != QString("hello world from content index")) { + failed.store(true); + return; + } + + if (retriever.fetchContent("/tmp/img-a.png", SearchType::Ocr) + != QString("screenshot text from OCR")) { + failed.store(true); + return; + } + + const QString snippet = retriever.fetchHighlight("/tmp/doc-b.txt", + "budget", + SearchType::Content, + options); + if (!snippet.contains("budget", Qt::CaseInsensitive)) { + failed.store(true); + return; + } + } + })); + } + + for (auto &task : tasks) { + task.get(); + } + + QVERIFY(!failed.load()); +} + +QObject *create_tst_ContentRetriever() +{ + return new tst_ContentRetriever(); +} + +#include "tst_content_retriever.moc" diff --git a/include/dfm-search/dfm-search/contentretriever.h 
b/include/dfm-search/dfm-search/contentretriever.h index cd4d4701..673f38e6 100644 --- a/include/dfm-search/dfm-search/contentretriever.h +++ b/include/dfm-search/dfm-search/contentretriever.h @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -43,6 +44,20 @@ class ContentRetriever : public QObject explicit ContentRetriever(QObject *parent = nullptr); ~ContentRetriever() override; + /** + * @brief Override the Lucene index directory for a given text search type. + * + * When @p indexDirectory is empty, the default global index directory for + * the given type will be used. This is primarily useful for tests or + * isolated business scenarios that need to point at a temporary index. + */ + void setIndexDirectory(SearchType type, const QString &indexDirectory); + + /** + * @brief Return the effective index directory for the given text search type. + */ + QString indexDirectory(SearchType type) const; + /** * @brief Synchronously fetch highlighted content for a single file * @@ -69,6 +84,29 @@ class ContentRetriever : public QObject const QString &keyword, SearchType type, const HighlightOptions &options = {}) const; + + /** + * @brief Synchronously fetch full stored content for a single file + * + * Opens the Lucene index, locates the document by path, + * and returns the full stored content field. 
+ * + * @param path Absolute file path + * @param type SearchType::Content or SearchType::Ocr + * @return Full content text, or empty string if not found + */ + QString fetchContent(const QString &path, SearchType type) const; + + /** + * @brief Synchronously fetch full stored contents for multiple files + * @return Mapping of path -> full content (empty string if not found) + */ + QMap fetchContents(const QStringList &paths, + SearchType type) const; + +private: + struct Private; + std::unique_ptr d; }; DFM_SEARCH_END_NS diff --git a/include/dfm-search/dfm-search/field_names.h b/include/dfm-search/dfm-search/field_names.h index c1dfb51b..d4fb833f 100644 --- a/include/dfm-search/dfm-search/field_names.h +++ b/include/dfm-search/dfm-search/field_names.h @@ -38,6 +38,7 @@ constexpr const wchar_t kAncestorPaths[] = L"ancestor_paths"; constexpr const wchar_t kBirthTime[] = L"birth_time"; constexpr const wchar_t kModifyTime[] = L"modify_time"; constexpr const wchar_t kFileSize[] = L"file_size"; +constexpr const wchar_t kCheckSum[] = L"checksum"; } // namespace Content // OCR text index field names diff --git a/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp b/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp index 38326581..8dbc7c4a 100644 --- a/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp +++ b/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp @@ -3,21 +3,23 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include -#include #include +#include + #include "utils/contenthighlighter.h" -#include -#include #include +#include +#include +#include -#include -#include -#include +#include +#include #include #include -#include -#include +#include +#include +#include #include using namespace Lucene; @@ -26,9 +28,6 @@ DFM_SEARCH_BEGIN_NS namespace { -/** - * @brief Get the Lucene content field name for the given search type - */ const wchar_t *contentFieldName(SearchType type) { return (type == SearchType::Ocr) @@ -36,9 +35,6 @@ const wchar_t 
*contentFieldName(SearchType type) : LuceneFieldNames::Content::kContents; } -/** - * @brief Get the Lucene path field name for the given search type - */ const wchar_t *pathFieldName(SearchType type) { return (type == SearchType::Ocr) @@ -46,25 +42,32 @@ const wchar_t *pathFieldName(SearchType type) : LuceneFieldNames::Content::kPath; } -/** - * @brief Get the index directory path for the given search type - */ -QString indexDirectoryForType(SearchType type) +QString defaultIndexDirectoryForType(SearchType type) { return (type == SearchType::Ocr) ? Global::ocrTextIndexDirectory() : Global::contentIndexDirectory(); } -/** - * @brief Split keyword string into a QStringList suitable for ContentHighlighter - * - * Supports comma-separated multi-keyword input, consistent with - * how the search pipeline handles boolean queries. - */ +QString storedContentFromDocument(const DocumentPtr &doc, SearchType type) +{ + if (!doc) { + return {}; + } + + const String contentField = doc->get(contentFieldName(type)); + if (contentField.empty()) { + return {}; + } + + return QString::fromStdWString(contentField); +} + QStringList splitKeywords(const QString &keyword) { - if (keyword.isEmpty()) return {}; + if (keyword.isEmpty()) { + return {}; + } #if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) return keyword.split(',', Qt::SkipEmptyParts); #else @@ -72,15 +75,123 @@ QStringList splitKeywords(const QString &keyword) #endif } +DocumentPtr findDocumentByPath(const SearcherPtr &searcher, + const QString &path, + SearchType type) +{ + TermPtr term = newLucene(pathFieldName(type), path.toStdWString()); + QueryPtr query = newLucene(term); + + TopDocsPtr topDocs = searcher->search(query, 1); + if (!topDocs || topDocs->totalHits == 0) { + return nullptr; + } + + return searcher->doc(topDocs->scoreDocs[0]->doc); +} + +struct CachedIndexContext +{ + QString indexDirectory; + FSDirectoryPtr directory; + IndexReaderPtr reader; + SearcherPtr searcher; +}; + } // namespace +struct 
ContentRetriever::Private +{ + QString contentIndexDirectory; + QString ocrIndexDirectory; + mutable QMutex mutex; + mutable QHash cacheByType; + + CachedIndexContext *ensureIndexContext(SearchType type, const QString &indexDir) const + { + CachedIndexContext &ctx = cacheByType[static_cast(type)]; + if (ctx.searcher && ctx.reader && ctx.directory && ctx.indexDirectory == indexDir) { + try { + if (!ctx.reader->isCurrent()) { + IndexReaderPtr reopened = ctx.reader->reopen(true); + if (reopened != ctx.reader) { + ctx.reader = reopened; + ctx.searcher = newLucene(ctx.reader); + } + } + return &ctx; + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: failed to refresh index reader" + << QString::fromStdWString(e.getError()); + ctx = {}; + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: failed to refresh index reader" << e.what(); + ctx = {}; + } + } + + try { + ctx.indexDirectory = indexDir; + ctx.directory = FSDirectory::open(indexDir.toStdWString()); + if (!IndexReader::indexExists(ctx.directory)) { + ctx = {}; + return nullptr; + } + + ctx.reader = IndexReader::open(ctx.directory, true); + ctx.searcher = newLucene(ctx.reader); + return &ctx; + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: failed to open index" + << QString::fromStdWString(e.getError()); + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: failed to open index" << e.what(); + } + + ctx = {}; + return nullptr; + } +}; + ContentRetriever::ContentRetriever(QObject *parent) - : QObject(parent) + : QObject(parent), + d(std::make_unique()) { } ContentRetriever::~ContentRetriever() = default; +void ContentRetriever::setIndexDirectory(SearchType type, const QString &indexDirectory) +{ + if (type != SearchType::Content && type != SearchType::Ocr) { + return; + } + + QMutexLocker locker(&d->mutex); + if (type == SearchType::Ocr) { + d->ocrIndexDirectory = indexDirectory; + } else { + d->contentIndexDirectory = 
indexDirectory; + } + d->cacheByType.remove(static_cast(type)); +} + +QString ContentRetriever::indexDirectory(SearchType type) const +{ + if (type == SearchType::Ocr) { + return d->ocrIndexDirectory.isEmpty() + ? defaultIndexDirectoryForType(type) + : d->ocrIndexDirectory; + } + if (type == SearchType::Content) { + return d->contentIndexDirectory.isEmpty() + ? defaultIndexDirectoryForType(type) + : d->contentIndexDirectory; + } + + return {}; +} + QString ContentRetriever::fetchHighlight(const QString &path, const QString &keyword, SearchType type, @@ -89,33 +200,26 @@ QString ContentRetriever::fetchHighlight(const QString &path, if (path.isEmpty() || keyword.isEmpty()) return {}; if (type != SearchType::Content && type != SearchType::Ocr) return {}; - QStringList keywords = splitKeywords(keyword); + const QStringList keywords = splitKeywords(keyword); if (keywords.isEmpty()) return {}; - try { - const QString indexDir = indexDirectoryForType(type); - IndexReaderPtr reader = IndexReader::open(FSDirectory::open(indexDir.toStdWString())); - IndexSearcherPtr searcher = newLucene(reader); - - // Build a term query on the path field to find the exact document - TermPtr term = newLucene(pathFieldName(type), path.toStdWString()); - QueryPtr query = newLucene(term); + const QString indexDir = indexDirectory(type); - TopDocsPtr topDocs = searcher->search(query, 1); - if (!topDocs || topDocs->totalHits == 0) { - return {}; - } + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return {}; + } - DocumentPtr doc = searcher->doc(topDocs->scoreDocs[0]->doc); - String contentField = doc->get(contentFieldName(type)); - if (contentField.empty()) { + try { + const DocumentPtr doc = findDocumentByPath(ctx->searcher, path, type); + const QString content = storedContentFromDocument(doc, type); + if (content.isEmpty()) { return {}; } - const QString content = QString::fromStdWString(contentField); return 
ContentHighlighter::customHighlight( keywords, content, options.maxPreviewLength, options.enableHtml); - } catch (const LuceneException &e) { qWarning() << "ContentRetriever: error fetching highlight for" << path << QString::fromStdWString(e.getError()); @@ -135,48 +239,92 @@ QMap ContentRetriever::fetchHighlights(const QStringList &path if (paths.isEmpty() || keyword.isEmpty()) return results; if (type != SearchType::Content && type != SearchType::Ocr) return results; - QStringList keywords = splitKeywords(keyword); + const QStringList keywords = splitKeywords(keyword); if (keywords.isEmpty()) return results; - try { - // Open index reader once for all paths - const QString indexDir = indexDirectoryForType(type); - IndexReaderPtr reader = IndexReader::open(FSDirectory::open(indexDir.toStdWString())); - IndexSearcherPtr searcher = newLucene(reader); + const QString indexDir = indexDirectory(type); - for (const QString &path : paths) { - try { - TermPtr term = newLucene(pathFieldName(type), path.toStdWString()); - QueryPtr query = newLucene(term); - - TopDocsPtr topDocs = searcher->search(query, 1); - if (!topDocs || topDocs->totalHits == 0) { - results.insert(path, {}); - continue; - } - - DocumentPtr doc = searcher->doc(topDocs->scoreDocs[0]->doc); - String contentField = doc->get(contentFieldName(type)); - if (contentField.empty()) { - results.insert(path, {}); - continue; - } - - const QString content = QString::fromStdWString(contentField); - results.insert(path, ContentHighlighter::customHighlight(keywords, content, options.maxPreviewLength, options.enableHtml)); + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return results; + } - } catch (const LuceneException &e) { - qWarning() << "ContentRetriever: error for" << path - << QString::fromStdWString(e.getError()); - results.insert(path, {}); - } catch (const std::exception &e) { - qWarning() << "ContentRetriever: std error 
for" << path << e.what(); + for (const QString &path : paths) { + try { + const DocumentPtr doc = findDocumentByPath(ctx->searcher, path, type); + const QString content = storedContentFromDocument(doc, type); + if (content.isEmpty()) { results.insert(path, {}); + continue; } + + results.insert(path, ContentHighlighter::customHighlight( + keywords, content, options.maxPreviewLength, options.enableHtml)); + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: error for" << path + << QString::fromStdWString(e.getError()); + results.insert(path, {}); + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + results.insert(path, {}); } + } + + return results; +} + +QString ContentRetriever::fetchContent(const QString &path, SearchType type) const +{ + if (path.isEmpty()) return {}; + if (type != SearchType::Content && type != SearchType::Ocr) return {}; + + const QString indexDir = indexDirectory(type); + + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return {}; + } + + try { + return storedContentFromDocument(findDocumentByPath(ctx->searcher, path, type), type); } catch (const LuceneException &e) { - qWarning() << "ContentRetriever: failed to open index" + qWarning() << "ContentRetriever: error fetching content for" << path << QString::fromStdWString(e.getError()); + return {}; + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + return {}; + } +} + +QMap ContentRetriever::fetchContents(const QStringList &paths, + SearchType type) const +{ + QMap results; + if (paths.isEmpty()) return results; + if (type != SearchType::Content && type != SearchType::Ocr) return results; + + const QString indexDir = indexDirectory(type); + + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + 
return results; + } + + for (const QString &path : paths) { + try { + results.insert(path, storedContentFromDocument(findDocumentByPath(ctx->searcher, path, type), type)); + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: error for" << path + << QString::fromStdWString(e.getError()); + results.insert(path, {}); + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + results.insert(path, {}); + } } return results; From 388821009a33d6c3612c58673cdeace9fa132f87 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Sun, 24 May 2026 21:49:13 +0800 Subject: [PATCH 31/36] test: add test utility libraries for content search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Added elfio library for ELF file parsing and manipulation 2. Implemented addr_any.h for address resolution across executable sections 3. Added addr_pri.h for accessing private class members 4. Added stub.h for function hooking/replacement 5. Added stub-ext extensions with stub-shadow support 6. Integrated utilities into test framework Log: Implemented test utilities including elf parsing, function hooking and private member access for content search testing Influence: 1. Test content search index building and querying 2. Verify search result accuracy with controlled inputs 3. Test private API behaviors through reflection 4. Validate text and OCR search functionality test: 添加内容搜索测试工具库 1. 新增elfio库用于ELF文件解析和操作 2. 实现addr_any.h用于跨可执行段地址解析 3. 添加addr_pri.h用于访问私有类成员 4. 添加stub.h用于函数钩子/替换 5. 添加带stub-shadow支持的stub-ext扩展 6. 将工具集成到测试框架中 Log: 实现了内容搜索测试工具,包括ELF解析、函数钩子和私有成员访问 Influence: 1. 测试内容搜索索引构建和查询 2. 使用受控输入验证搜索结果准确性 3. 通过反射测试私有API行为 4. 
验证文本和OCR搜索功能 --- 3rdparty/.gitkeep | 0 3rdparty/testutils/cpp-stub/addr_any.h | 280 + 3rdparty/testutils/cpp-stub/addr_pri.h | 177 + 3rdparty/testutils/cpp-stub/elfio.hpp | 4888 +++++++++++++++++ 3rdparty/testutils/cpp-stub/stub.h | 360 ++ 3rdparty/testutils/stub-ext/stub-shadow.cpp | 35 + 3rdparty/testutils/stub-ext/stub-shadow.h | 166 + 3rdparty/testutils/stub-ext/stubext.h | 107 + autotests/dfm-search-tests/CMakeLists.txt | 3 + autotests/dfm-search-tests/main.cpp | 5 + .../tst_content_search_engine.cpp | 495 ++ 11 files changed, 6516 insertions(+) create mode 100644 3rdparty/.gitkeep create mode 100644 3rdparty/testutils/cpp-stub/addr_any.h create mode 100644 3rdparty/testutils/cpp-stub/addr_pri.h create mode 100644 3rdparty/testutils/cpp-stub/elfio.hpp create mode 100644 3rdparty/testutils/cpp-stub/stub.h create mode 100644 3rdparty/testutils/stub-ext/stub-shadow.cpp create mode 100644 3rdparty/testutils/stub-ext/stub-shadow.h create mode 100644 3rdparty/testutils/stub-ext/stubext.h create mode 100644 autotests/dfm-search-tests/tst_content_search_engine.cpp diff --git a/3rdparty/.gitkeep b/3rdparty/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/3rdparty/testutils/cpp-stub/addr_any.h b/3rdparty/testutils/cpp-stub/addr_any.h new file mode 100644 index 00000000..a153f348 --- /dev/null +++ b/3rdparty/testutils/cpp-stub/addr_any.h @@ -0,0 +1,280 @@ +#ifndef __ADDR_ANY_H__ +#define __ADDR_ANY_H__ + + +//linux +#include +#include +//c +#include +#include +#include + +//c++ +#include +#include +//project +#include "elfio.hpp" + + + +class AddrAny +{ +public: + AddrAny() + { + m_init = get_exe_pathname(m_fullname); + m_baseaddr = 0; + } + AddrAny(std::string libname) + { + m_init = get_lib_pathname_and_baseaddr(libname, m_fullname, m_baseaddr); + } + + int get_local_func_addr_symtab(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_SYMTAB, STB_LOCAL, func_name_regex_str, result); + } + int 
get_global_func_addr_symtab(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_SYMTAB, STB_GLOBAL, func_name_regex_str, result); + } + int get_weak_func_addr_symtab(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_SYMTAB, STB_WEAK, func_name_regex_str, result); + } + + int get_global_func_addr_dynsym( std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_DYNSYM, STB_GLOBAL, func_name_regex_str, result); + } + int get_weak_func_addr_dynsym(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_DYNSYM, STB_WEAK, func_name_regex_str, result); + } + +private: + bool demangle(std::string& s, std::string& name) { + int status; + char* pname = abi::__cxa_demangle(s.c_str(), 0, 0, &status); + if (status != 0) + { + switch(status) + { + case -1: name = "memory allocation error"; break; + case -2: name = "invalid name given"; break; + case -3: name = "internal error: __cxa_demangle: invalid argument"; break; + default: name = "unknown error occured"; break; + } + return false; + } + name = pname; + free(pname); + return true; + } + bool get_exe_pathname( std::string& name) + { + char line[512]; + FILE *fp; + uintptr_t base_addr; + char perm[5]; + unsigned long offset; + int pathname_pos; + char *pathname; + size_t pathname_len; + int match = 0; + + if(NULL == (fp = fopen("/proc/self/maps", "r"))) + { + return false; + } + + while(fgets(line, sizeof(line), fp)) + { + if(sscanf(line, "%" PRIxPTR "-%*lx %4s %lx %*x:%*x %*d%n", &base_addr, perm, &offset, &pathname_pos) != 3) continue; + + if(0 != offset) continue; + + //get pathname + while(isspace(line[pathname_pos]) && pathname_pos < (int)(sizeof(line) - 1)) + pathname_pos += 1; + if(pathname_pos >= (int)(sizeof(line) - 1)) continue; + pathname = line + pathname_pos; + pathname_len = strlen(pathname); + if(0 == pathname_len) continue; + if(pathname[pathname_len - 1] == '\n') + { + pathname[pathname_len - 1] = 
'\0'; + pathname_len -= 1; + } + if(0 == pathname_len) continue; + if('[' == pathname[0]) continue; + + name = pathname; + match = 1; + break; + + } + fclose(fp); + + if(0 == match) + { + return false; + } + else + { + return true; + } + + } + + bool get_lib_pathname_and_baseaddr(std::string pathname_regex_str, std::string& name, unsigned long& addr) + { + char line[512]; + FILE *fp; + uintptr_t base_addr; + char perm[5]; + unsigned long offset; + int pathname_pos; + char *pathname; + size_t pathname_len; + int match; + regex_t pathname_regex; + + regcomp(&pathname_regex, pathname_regex_str.c_str(), 0); + + if(NULL == (fp = fopen("/proc/self/maps", "r"))) + { + return false; + } + + while(fgets(line, sizeof(line), fp)) + { + if(sscanf(line, "%" PRIxPTR "-%*lx %4s %lx %*x:%*x %*d%n", &base_addr, perm, &offset, &pathname_pos) != 3) continue; + + //check permission + if(perm[0] != 'r') continue; + if(perm[3] != 'p') continue; //do not touch the shared memory + + //check offset + // + //We are trying to find ELF header in memory. + //It can only be found at the beginning of a mapped memory regions + //whose offset is 0. + if(0 != offset) continue; + + //get pathname + while(isspace(line[pathname_pos]) && pathname_pos < (int)(sizeof(line) - 1)) + pathname_pos += 1; + if(pathname_pos >= (int)(sizeof(line) - 1)) continue; + pathname = line + pathname_pos; + pathname_len = strlen(pathname); + if(0 == pathname_len) continue; + if(pathname[pathname_len - 1] == '\n') + { + pathname[pathname_len - 1] = '\0'; + pathname_len -= 1; + } + if(0 == pathname_len) continue; + if('[' == pathname[0]) continue; + + //check pathname + //if we need to hook this elf? 
+ match = 0; + if(0 == regexec(&pathname_regex, pathname, 0, NULL, 0)) + { + match = 1; + name = pathname; + addr = (unsigned long)base_addr; + break; + } + if(0 == match) continue; + + } + fclose(fp); + if(0 == match) + { + return false; + } + else + { + return true; + } + + } + + int get_func_addr(unsigned int ttype, unsigned int stype, std::string& func_name_regex_str, std::map& result) + { + // Create an elfio reader + ELFIO::elfio reader; + int count = 0; + regex_t pathname_regex; + + if(!m_init) + { + return -1; + } + + regcomp(&pathname_regex, func_name_regex_str.c_str(), 0); + // Load ELF data + if(!reader.load(m_fullname.c_str())) + { + return -1; + } + + ELFIO::Elf_Half sec_num = reader.sections.size(); + for(int i = 0; i < sec_num; ++i) + { + ELFIO::section* psec = reader.sections[i]; + // Check section type + if(psec->get_type() == ttype) + { + const ELFIO::symbol_section_accessor symbols( reader, psec ); + for ( unsigned int j = 0; j < symbols.get_symbols_num(); ++j ) + { + std::string name; + std::string name_mangle; + ELFIO::Elf64_Addr value; + ELFIO::Elf_Xword size; + unsigned char bind; + unsigned char type; + ELFIO::Elf_Half section_index; + unsigned char other; + + // Read symbol properties + symbols.get_symbol( j, name, value, size, bind, type, section_index, other ); + if(type == STT_FUNC && bind == stype) + { + bool ret = demangle(name,name_mangle); + if(ret == true) + { + if (0 == regexec(&pathname_regex, name_mangle.c_str(), 0, NULL, 0)) + { + result.insert ( std::pair(name_mangle,(void*)(value + m_baseaddr))); + count++; + } + } + else + { + if (0 == regexec(&pathname_regex, name.c_str(), 0, NULL, 0)) + { + result.insert ( std::pair(name,(void*)(value + m_baseaddr))); + count++; + } + } + } + } + break; + } + } + + return count; + } +private: + bool m_init; + std::string m_name; + std::string m_fullname; + unsigned long m_baseaddr; + +}; +#endif diff --git a/3rdparty/testutils/cpp-stub/addr_pri.h b/3rdparty/testutils/cpp-stub/addr_pri.h new 
file mode 100644 index 00000000..9174bb0c --- /dev/null +++ b/3rdparty/testutils/cpp-stub/addr_pri.h @@ -0,0 +1,177 @@ +#ifndef __ADDR_PRI_H__ +#define __ADDR_PRI_H__ + + +#include +#include + + + +//base on C++11 + +/********************************************************** + access private function +**********************************************************/ + + +namespace std { + template + using enable_if_t = typename enable_if::type; + template + using remove_reference_t = typename remove_reference::type; +} // std + +// Unnamed namespace is used to avoid duplicate symbols if the macros are used +namespace { + namespace private_access_detail { + + // @tparam TagType, used to declare different "get" funciton overloads for + // different members/statics + template + struct private_access { + // Normal lookup cannot find in-class defined (inline) friend functions. + friend PtrType get(TagType) { return PtrValue; } + }; + + } // namespace private_access_detail +} // namespace + +// Used macro naming conventions: +// The "namespace" of this macro library is PRIVATE_ACCESS, i.e. all +// macro here has this prefix. +// All implementation macro, which are not meant to be used directly have the +// PRIVATE_ACCESS_DETAIL prefix. +// Some macros have the ABCD_IMPL form, which means they contain the +// implementation details for the specific ABCD macro. + +#define PRIVATE_ACCESS_DETAIL_CONCATENATE_IMPL(x, y) x##y +#define PRIVATE_ACCESS_DETAIL_CONCATENATE(x, y) \ + PRIVATE_ACCESS_DETAIL_CONCATENATE_IMPL(x, y) + +// @param PtrTypeKind E.g if we have "class A", then it can be "A::*" in case of +// members, or it can be "*" in case of statics. 
+#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, \ + PtrTypeKind) \ + namespace { \ + namespace private_access_detail { \ + /* Tag type, used to declare different get funcitons for different \ + * members \ + */ \ + struct Tag {}; \ + /* Explicit instantiation */ \ + template struct private_access; \ + /* We can build the PtrType only with two aliases */ \ + /* E.g. using PtrType = int(int) *; would be illformed */ \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(Alias_, Tag) = Type; \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(PtrType_, Tag) = \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(Alias_, Tag) PtrTypeKind; \ + /* Declare the friend function, now it is visible in namespace scope. \ + * Note, \ + * we could declare it inside the Tag type too, in that case ADL would \ + * find \ + * the declaration. By choosing to declare it here, the Tag type remains \ + * a \ + * simple tag type, it has no other responsibilities. */ \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(PtrType_, Tag) get(Tag); \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FIELD(Tag, Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, Class::*) \ + namespace { \ + namespace access_private_field { \ + Type &Class##Name(Class &&t) { return t.*get(private_access_detail::Tag{}); } \ + Type &Class##Name(Class &t) { return t.*get(private_access_detail::Tag{}); } \ + /* The following usings are here to avoid duplicate const qualifier \ + * warnings \ + */ \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(X, Tag) = Type; \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(Y, Tag) = \ + const PRIVATE_ACCESS_DETAIL_CONCATENATE(X, Tag); \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(Y, Tag) & Class##Name(const Class &t) {\ + return t.*get(private_access_detail::Tag{}); \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FUN(Tag, Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, Class::*) \ + namespace { \ + namespace call_private_fun { \ + 
/* We do perfect forwarding, but we want to restrict the overload set \ + * only for objects which have the type Class. */ \ + template , \ + Class>::value> * = nullptr, \ + typename... Args> \ + auto Class##Name(Obj &&o, Args &&... args) -> decltype( \ + (std::forward(o).* \ + get(private_access_detail::Tag{}))(std::forward(args)...)) { \ + return (std::forward(o).*get(private_access_detail::Tag{}))( \ + std::forward(args)...); \ + } \ + } \ + namespace get_private_fun { \ + auto Class##Name() -> decltype( \ + get(private_access_detail::Tag{})) { \ + return (get(private_access_detail::Tag{})); \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FIELD(Tag, Class, Type, \ + Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, *) \ + namespace { \ + namespace access_private_static_field { \ + namespace Class { \ + Type &Class##Name() { return *get(private_access_detail::Tag{}); } \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FUN(Tag, Class, Type, \ + Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, *) \ + namespace { \ + namespace call_private_static_fun { \ + namespace Class { \ + template \ + auto Class##Name(Args &&... 
args) -> decltype( \ + get(private_access_detail::Tag{})(std::forward(args)...)) { \ + return get(private_access_detail::Tag{})( \ + std::forward(args)...); \ + } \ + } \ + } \ + namespace get_private_static_fun { \ + namespace Class { \ + auto Class##Name() -> decltype(get(private_access_detail::Tag{})) { \ + return get(private_access_detail::Tag{}); \ + } \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_UNIQUE_TAG \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(PrivateAccessTag, __COUNTER__) + +#define ACCESS_PRIVATE_FIELD(Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FIELD(PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, \ + Class, Type, Name) + +#define ACCESS_PRIVATE_FUN(Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FUN(PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, \ + Class, Type, Name) + +#define ACCESS_PRIVATE_STATIC_FIELD(Class, Type, Name) \ + Type Class::Name; \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FIELD( \ + PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, Class, Type, Name) + +#define ACCESS_PRIVATE_STATIC_FUN(Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FUN( \ + PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, Class, Type, Name) + +#endif diff --git a/3rdparty/testutils/cpp-stub/elfio.hpp b/3rdparty/testutils/cpp-stub/elfio.hpp new file mode 100644 index 00000000..dd5c9aec --- /dev/null +++ b/3rdparty/testutils/cpp-stub/elfio.hpp @@ -0,0 +1,4888 @@ + +/*** Start of inlined file: elfio_dump.hpp ***/ +#ifndef ELFIO_DUMP_HPP +#define ELFIO_DUMP_HPP + +#include +#include +#include +#include +#include + +/*** Start of inlined file: elfio.hpp ***/ +#ifndef ELFIO_HPP +#define ELFIO_HPP + +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4996 ) +#pragma warning( disable : 4355 ) +#pragma warning( disable : 4244 ) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +/*** Start of inlined file: elf_types.hpp ***/ +#ifndef ELFTYPES_H +#define ELFTYPES_H + +#ifndef ELFIO_NO_OWN_TYPES +#if !defined( 
ELFIO_NO_CSTDINT ) && !defined( ELFIO_NO_INTTYPES ) +#include +#else +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +#ifdef _MSC_VER +typedef unsigned __int32 uint32_t; +typedef signed __int32 int32_t; +typedef unsigned __int64 uint64_t; +typedef signed __int64 int64_t; +#else +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +#endif // _MSC_VER +#endif // ELFIO_NO_CSTDINT +#endif // ELFIO_NO_OWN_TYPES + +namespace ELFIO { + +// Attention! Platform depended definitions. +typedef uint16_t Elf_Half; +typedef uint32_t Elf_Word; +typedef int32_t Elf_Sword; +typedef uint64_t Elf_Xword; +typedef int64_t Elf_Sxword; + +typedef uint32_t Elf32_Addr; +typedef uint32_t Elf32_Off; +typedef uint64_t Elf64_Addr; +typedef uint64_t Elf64_Off; + +#define Elf32_Half Elf_Half +#define Elf64_Half Elf_Half +#define Elf32_Word Elf_Word +#define Elf64_Word Elf_Word +#define Elf32_Sword Elf_Sword +#define Elf64_Sword Elf_Sword + +/////////////////////// +// ELF Header Constants + +// File type +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOOS 0xFE00 +#define ET_HIOS 0xFEFF +#define ET_LOPROC 0xFF00 +#define ET_HIPROC 0xFFFF + +#define EM_NONE 0 // No machine +#define EM_M32 1 // AT&T WE 32100 +#define EM_SPARC 2 // SUN SPARC +#define EM_386 3 // Intel 80386 +#define EM_68K 4 // Motorola m68k family +#define EM_88K 5 // Motorola m88k family +#define EM_486 6 // Intel 80486// Reserved for future use +#define EM_860 7 // Intel 80860 +#define EM_MIPS 8 // MIPS R3000 (officially, big-endian only) +#define EM_S370 9 // IBM System/370 +#define EM_MIPS_RS3_LE \ + 10 // MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated +#define EM_res011 11 // Reserved +#define EM_res012 12 // Reserved +#define EM_res013 13 // Reserved +#define EM_res014 14 // Reserved +#define EM_PARISC 15 
// HPPA +#define EM_res016 16 // Reserved +#define EM_VPP550 17 // Fujitsu VPP500 +#define EM_SPARC32PLUS 18 // Sun's "v8plus" +#define EM_960 19 // Intel 80960 +#define EM_PPC 20 // PowerPC +#define EM_PPC64 21 // 64-bit PowerPC +#define EM_S390 22 // IBM S/390 +#define EM_SPU 23 // Sony/Toshiba/IBM SPU +#define EM_res024 24 // Reserved +#define EM_res025 25 // Reserved +#define EM_res026 26 // Reserved +#define EM_res027 27 // Reserved +#define EM_res028 28 // Reserved +#define EM_res029 29 // Reserved +#define EM_res030 30 // Reserved +#define EM_res031 31 // Reserved +#define EM_res032 32 // Reserved +#define EM_res033 33 // Reserved +#define EM_res034 34 // Reserved +#define EM_res035 35 // Reserved +#define EM_V800 36 // NEC V800 series +#define EM_FR20 37 // Fujitsu FR20 +#define EM_RH32 38 // TRW RH32 +#define EM_MCORE 39 // Motorola M*Core // May also be taken by Fujitsu MMA +#define EM_RCE 39 // Old name for MCore +#define EM_ARM 40 // ARM +#define EM_OLD_ALPHA 41 // Digital Alpha +#define EM_SH 42 // Renesas (formerly Hitachi) / SuperH SH +#define EM_SPARCV9 43 // SPARC v9 64-bit +#define EM_TRICORE 44 // Siemens Tricore embedded processor +#define EM_ARC 45 // ARC Cores +#define EM_H8_300 46 // Renesas (formerly Hitachi) H8/300 +#define EM_H8_300H 47 // Renesas (formerly Hitachi) H8/300H +#define EM_H8S 48 // Renesas (formerly Hitachi) H8S +#define EM_H8_500 49 // Renesas (formerly Hitachi) H8/500 +#define EM_IA_64 50 // Intel IA-64 Processor +#define EM_MIPS_X 51 // Stanford MIPS-X +#define EM_COLDFIRE 52 // Motorola Coldfire +#define EM_68HC12 53 // Motorola M68HC12 +#define EM_MMA 54 // Fujitsu Multimedia Accelerator +#define EM_PCP 55 // Siemens PCP +#define EM_NCPU 56 // Sony nCPU embedded RISC processor +#define EM_NDR1 57 // Denso NDR1 microprocesspr +#define EM_STARCORE 58 // Motorola Star*Core processor +#define EM_ME16 59 // Toyota ME16 processor +#define EM_ST100 60 // STMicroelectronics ST100 processor +#define EM_TINYJ 61 // Advanced Logic 
Corp. TinyJ embedded processor +#define EM_X86_64 62 // Advanced Micro Devices X86-64 processor +#define EM_PDSP 63 // Sony DSP Processor +#define EM_PDP10 64 // Digital Equipment Corp. PDP-10 +#define EM_PDP11 65 // Digital Equipment Corp. PDP-11 +#define EM_FX66 66 // Siemens FX66 microcontroller +#define EM_ST9PLUS 67 // STMicroelectronics ST9+ 8/16 bit microcontroller +#define EM_ST7 68 // STMicroelectronics ST7 8-bit microcontroller +#define EM_68HC16 69 // Motorola MC68HC16 Microcontroller +#define EM_68HC11 70 // Motorola MC68HC11 Microcontroller +#define EM_68HC08 71 // Motorola MC68HC08 Microcontroller +#define EM_68HC05 72 // Motorola MC68HC05 Microcontroller +#define EM_SVX 73 // Silicon Graphics SVx +#define EM_ST19 74 // STMicroelectronics ST19 8-bit cpu +#define EM_VAX 75 // Digital VAX +#define EM_CRIS 76 // Axis Communications 32-bit embedded processor +#define EM_JAVELIN 77 // Infineon Technologies 32-bit embedded cpu +#define EM_FIREPATH 78 // Element 14 64-bit DSP processor +#define EM_ZSP 79 // LSI Logic's 16-bit DSP processor +#define EM_MMIX 80 // Donald Knuth's educational 64-bit processor +#define EM_HUANY 81 // Harvard's machine-independent format +#define EM_PRISM 82 // SiTera Prism +#define EM_AVR 83 // Atmel AVR 8-bit microcontroller +#define EM_FR30 84 // Fujitsu FR30 +#define EM_D10V 85 // Mitsubishi D10V +#define EM_D30V 86 // Mitsubishi D30V +#define EM_V850 87 // NEC v850 +#define EM_M32R 88 // Renesas M32R (formerly Mitsubishi M32R) +#define EM_MN10300 89 // Matsushita MN10300 +#define EM_MN10200 90 // Matsushita MN10200 +#define EM_PJ 91 // picoJava +#define EM_OPENRISC 92 // OpenRISC 32-bit embedded processor +#define EM_ARC_A5 93 // ARC Cores Tangent-A5 +#define EM_XTENSA 94 // Tensilica Xtensa Architecture +#define EM_VIDEOCORE 95 // Alphamosaic VideoCore processor +#define EM_TMM_GPP 96 // Thompson Multimedia General Purpose Processor +#define EM_NS32K 97 // National Semiconductor 32000 series +#define EM_TPC 98 // Tenor 
Network TPC processor +#define EM_SNP1K 99 // Trebia SNP 1000 processor +#define EM_ST200 100 // STMicroelectronics ST200 microcontroller +#define EM_IP2K 101 // Ubicom IP2022 micro controller +#define EM_MAX 102 // MAX Processor +#define EM_CR 103 // National Semiconductor CompactRISC +#define EM_F2MC16 104 // Fujitsu F2MC16 +#define EM_MSP430 105 // TI msp430 micro controller +#define EM_BLACKFIN 106 // ADI Blackfin +#define EM_SE_C33 107 // S1C33 Family of Seiko Epson processors +#define EM_SEP 108 // Sharp embedded microprocessor +#define EM_ARCA 109 // Arca RISC Microprocessor +#define EM_UNICORE \ + 110 // Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University +#define EM_EXCESS 111 // eXcess: 16/32/64-bit configurable embedded CPU +#define EM_DXP 112 // Icera Semiconductor Inc. Deep Execution Processor +#define EM_ALTERA_NIOS2 113 // Altera Nios II soft-core processor +#define EM_CRX 114 // National Semiconductor CRX +#define EM_XGATE 115 // Motorola XGATE embedded processor +#define EM_C166 116 // Infineon C16x/XC16x processor +#define EM_M16C 117 // Renesas M16C series microprocessors +#define EM_DSPIC30F \ + 118 // Microchip Technology dsPIC30F Digital Signal Controller +#define EM_CE 119 // Freescale Communication Engine RISC core +#define EM_M32C 120 // Renesas M32C series microprocessors +#define EM_res121 121 // Reserved +#define EM_res122 122 // Reserved +#define EM_res123 123 // Reserved +#define EM_res124 124 // Reserved +#define EM_res125 125 // Reserved +#define EM_res126 126 // Reserved +#define EM_res127 127 // Reserved +#define EM_res128 128 // Reserved +#define EM_res129 129 // Reserved +#define EM_res130 130 // Reserved +#define EM_TSK3000 131 // Altium TSK3000 core +#define EM_RS08 132 // Freescale RS08 embedded processor +#define EM_res133 133 // Reserved +#define EM_ECOG2 134 // Cyan Technology eCOG2 microprocessor +#define EM_SCORE 135 // Sunplus Score +#define EM_SCORE7 135 // Sunplus S+core7 RISC processor +#define 
EM_DSP24 136 // New Japan Radio (NJR) 24-bit DSP Processor +#define EM_VIDEOCORE3 137 // Broadcom VideoCore III processor +#define EM_LATTICEMICO32 138 // RISC processor for Lattice FPGA architecture +#define EM_SE_C17 139 // Seiko Epson C17 family +#define EM_TI_C6000 140 // Texas Instruments TMS320C6000 DSP family +#define EM_TI_C2000 141 // Texas Instruments TMS320C2000 DSP family +#define EM_TI_C5500 142 // Texas Instruments TMS320C55x DSP family +#define EM_res143 143 // Reserved +#define EM_res144 144 // Reserved +#define EM_res145 145 // Reserved +#define EM_res146 146 // Reserved +#define EM_res147 147 // Reserved +#define EM_res148 148 // Reserved +#define EM_res149 149 // Reserved +#define EM_res150 150 // Reserved +#define EM_res151 151 // Reserved +#define EM_res152 152 // Reserved +#define EM_res153 153 // Reserved +#define EM_res154 154 // Reserved +#define EM_res155 155 // Reserved +#define EM_res156 156 // Reserved +#define EM_res157 157 // Reserved +#define EM_res158 158 // Reserved +#define EM_res159 159 // Reserved +#define EM_MMDSP_PLUS 160 // STMicroelectronics 64bit VLIW Data Signal Processor +#define EM_CYPRESS_M8C 161 // Cypress M8C microprocessor +#define EM_R32C 162 // Renesas R32C series microprocessors +#define EM_TRIMEDIA 163 // NXP Semiconductors TriMedia architecture family +#define EM_QDSP6 164 // QUALCOMM DSP6 Processor +#define EM_8051 165 // Intel 8051 and variants +#define EM_STXP7X 166 // STMicroelectronics STxP7x family +#define EM_NDS32 \ + 167 // Andes Technology compact code size embedded RISC processor family +#define EM_ECOG1 168 // Cyan Technology eCOG1X family +#define EM_ECOG1X 168 // Cyan Technology eCOG1X family +#define EM_MAXQ30 169 // Dallas Semiconductor MAXQ30 Core Micro-controllers +#define EM_XIMO16 170 // New Japan Radio (NJR) 16-bit DSP Processor +#define EM_MANIK 171 // M2000 Reconfigurable RISC Microprocessor +#define EM_CRAYNV2 172 // Cray Inc. 
NV2 vector architecture +#define EM_RX 173 // Renesas RX family +#define EM_METAG 174 // Imagination Technologies META processor architecture +#define EM_MCST_ELBRUS 175 // MCST Elbrus general purpose hardware architecture +#define EM_ECOG16 176 // Cyan Technology eCOG16 family +#define EM_CR16 177 // National Semiconductor CompactRISC 16-bit processor +#define EM_ETPU 178 // Freescale Extended Time Processing Unit +#define EM_SLE9X 179 // Infineon Technologies SLE9X core +#define EM_L1OM 180 // Intel L1OM +#define EM_INTEL181 181 // Reserved by Intel +#define EM_INTEL182 182 // Reserved by Intel +#define EM_res183 183 // Reserved by ARM +#define EM_res184 184 // Reserved by ARM +#define EM_AVR32 185 // Atmel Corporation 32-bit microprocessor family +#define EM_STM8 186 // STMicroeletronics STM8 8-bit microcontroller +#define EM_TILE64 187 // Tilera TILE64 multicore architecture family +#define EM_TILEPRO 188 // Tilera TILEPro multicore architecture family +#define EM_MICROBLAZE 189 // Xilinx MicroBlaze 32-bit RISC soft processor core +#define EM_CUDA 190 // NVIDIA CUDA architecture +#define EM_TILEGX 191 // Tilera TILE-Gx multicore architecture family +#define EM_CLOUDSHIELD 192 // CloudShield architecture family +#define EM_COREA_1ST 193 // KIPO-KAIST Core-A 1st generation processor family +#define EM_COREA_2ND 194 // KIPO-KAIST Core-A 2nd generation processor family +#define EM_ARC_COMPACT2 195 // Synopsys ARCompact V2 +#define EM_OPEN8 196 // Open8 8-bit RISC soft processor core +#define EM_RL78 197 // Renesas RL78 family +#define EM_VIDEOCORE5 198 // Broadcom VideoCore V processor +#define EM_78KOR 199 // Renesas 78KOR family +#define EM_56800EX 200 // Freescale 56800EX Digital Signal Controller (DSC) +#define EM_BA1 201 // Beyond BA1 CPU architecture +#define EM_BA2 202 // Beyond BA2 CPU architecture +#define EM_XCORE 203 // XMOS xCORE processor family +#define EM_MCHP_PIC 204 // Microchip 8-bit PIC(r) family +#define EM_INTEL205 205 // Reserved by Intel 
+#define EM_INTEL206 206 // Reserved by Intel +#define EM_INTEL207 207 // Reserved by Intel +#define EM_INTEL208 208 // Reserved by Intel +#define EM_INTEL209 209 // Reserved by Intel +#define EM_KM32 210 // KM211 KM32 32-bit processor +#define EM_KMX32 211 // KM211 KMX32 32-bit processor +#define EM_KMX16 212 // KM211 KMX16 16-bit processor +#define EM_KMX8 213 // KM211 KMX8 8-bit processor +#define EM_KVARC 214 // KM211 KVARC processor +#define EM_CDP 215 // Paneve CDP architecture family +#define EM_COGE 216 // Cognitive Smart Memory Processor +#define EM_COOL 217 // iCelero CoolEngine +#define EM_NORC 218 // Nanoradio Optimized RISC +#define EM_CSR_KALIMBA 219 // CSR Kalimba architecture family +#define EM_Z80 220 // Zilog Z80 +#define EM_VISIUM 221 // Controls and Data Services VISIUMcore processor +#define EM_FT32 222 // FTDI Chip FT32 high performance 32-bit RISC architecture +#define EM_MOXIE 223 // Moxie processor family +#define EM_AMDGPU 224 // AMD GPU architecture +#define EM_RISCV 243 // RISC-V +#define EM_LANAI 244 // Lanai processor +#define EM_CEVA 245 // CEVA Processor Architecture Family +#define EM_CEVA_X2 246 // CEVA X2 Processor Family +#define EM_BPF 247 // Linux BPF – in-kernel virtual machine +#define EM_GRAPHCORE_IPU 248 // Graphcore Intelligent Processing Unit +#define EM_IMG1 249 // Imagination Technologies +#define EM_NFP 250 // Netronome Flow Processor (P) +#define EM_CSKY 252 // C-SKY processor family + +// File version +#define EV_NONE 0 +#define EV_CURRENT 1 + +// Identification index +#define EI_MAG0 0 +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_ABIVERSION 8 +#define EI_PAD 9 +#define EI_NIDENT 16 + +// Magic number +#define ELFMAG0 0x7F +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' + +// File class +#define ELFCLASSNONE 0 +#define ELFCLASS32 1 +#define ELFCLASS64 2 + +// Encoding +#define ELFDATANONE 0 +#define 
ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +// OS extensions +#define ELFOSABI_NONE 0 // No extensions or unspecified +#define ELFOSABI_HPUX 1 // Hewlett-Packard HP-UX +#define ELFOSABI_NETBSD 2 // NetBSD +#define ELFOSABI_LINUX 3 // Linux +#define ELFOSABI_SOLARIS 6 // Sun Solaris +#define ELFOSABI_AIX 7 // AIX +#define ELFOSABI_IRIX 8 // IRIX +#define ELFOSABI_FREEBSD 9 // FreeBSD +#define ELFOSABI_TRU64 10 // Compaq TRU64 UNIX +#define ELFOSABI_MODESTO 11 // Novell Modesto +#define ELFOSABI_OPENBSD 12 // Open BSD +#define ELFOSABI_OPENVMS 13 // Open VMS +#define ELFOSABI_NSK 14 // Hewlett-Packard Non-Stop Kernel +#define ELFOSABI_AROS 15 // Amiga Research OS +#define ELFOSABI_FENIXOS 16 // The FenixOS highly scalable multi-core OS +// 64-255 Architecture-specific value range +#define ELFOSABI_AMDGPU_HSA \ + 64 // AMDGPU OS for HSA compatible compute // kernels. +#define ELFOSABI_AMDGPU_PAL \ + 65 // AMDGPU OS for AMD PAL compatible graphics // shaders and compute kernels. +#define ELFOSABI_AMDGPU_MESA3D \ + 66 // AMDGPU OS for Mesa3D compatible graphics // shaders and compute kernels. + +// AMDGPU specific e_flags +#define EF_AMDGPU_MACH 0x0ff // AMDGPU processor selection mask. +#define EF_AMDGPU_XNACK \ + 0x100 // Indicates if the XNACK target feature is // enabled for all code contained in the ELF. +// AMDGPU processors +#define EF_AMDGPU_MACH_NONE 0x000 // Unspecified processor. 
+#define EF_AMDGPU_MACH_R600_R600 0x001 +#define EF_AMDGPU_MACH_R600_R630 0x002 +#define EF_AMDGPU_MACH_R600_RS880 0x003 +#define EF_AMDGPU_MACH_R600_RV670 0x004 +#define EF_AMDGPU_MACH_R600_RV710 0x005 +#define EF_AMDGPU_MACH_R600_RV730 0x006 +#define EF_AMDGPU_MACH_R600_RV770 0x007 +#define EF_AMDGPU_MACH_R600_CEDAR 0x008 +#define EF_AMDGPU_MACH_R600_CYPRESS 0x009 +#define EF_AMDGPU_MACH_R600_JUNIPER 0x00a +#define EF_AMDGPU_MACH_R600_REDWOOD 0x00b +#define EF_AMDGPU_MACH_R600_SUMO 0x00c +#define EF_AMDGPU_MACH_R600_BARTS 0x00d +#define EF_AMDGPU_MACH_R600_CAICOS 0x00e +#define EF_AMDGPU_MACH_R600_CAYMAN 0x00f +#define EF_AMDGPU_MACH_R600_TURKS 0x010 +#define EF_AMDGPU_MACH_R600_RESERVED_FIRST 0x011 +#define EF_AMDGPU_MACH_R600_RESERVED_LAST 0x01f +#define EF_AMDGPU_MACH_R600_FIRST EF_AMDGPU_MACH_R600_R600 +#define EF_AMDGPU_MACH_R600_LAST EF_AMDGPU_MACH_R600_TURKS +#define EF_AMDGPU_MACH_AMDGCN_GFX600 0x020 +#define EF_AMDGPU_MACH_AMDGCN_GFX601 0x021 +#define EF_AMDGPU_MACH_AMDGCN_GFX700 0x022 +#define EF_AMDGPU_MACH_AMDGCN_GFX701 0x023 +#define EF_AMDGPU_MACH_AMDGCN_GFX702 0x024 +#define EF_AMDGPU_MACH_AMDGCN_GFX703 0x025 +#define EF_AMDGPU_MACH_AMDGCN_GFX704 0x026 +#define EF_AMDGPU_MACH_AMDGCN_GFX801 0x028 +#define EF_AMDGPU_MACH_AMDGCN_GFX802 0x029 +#define EF_AMDGPU_MACH_AMDGCN_GFX803 0x02a +#define EF_AMDGPU_MACH_AMDGCN_GFX810 0x02b +#define EF_AMDGPU_MACH_AMDGCN_GFX900 0x02c +#define EF_AMDGPU_MACH_AMDGCN_GFX902 0x02d +#define EF_AMDGPU_MACH_AMDGCN_GFX904 0x02e +#define EF_AMDGPU_MACH_AMDGCN_GFX906 0x02f +#define EF_AMDGPU_MACH_AMDGCN_RESERVED0 0x027 +#define EF_AMDGPU_MACH_AMDGCN_RESERVED1 0x030 +#define EF_AMDGPU_MACH_AMDGCN_FIRST EF_AMDGPU_MACH_AMDGCN_GFX600 +#define EF_AMDGPU_MACH_AMDGCN_LAST EF_AMDGPU_MACH_AMDGCN_GFX906 + +///////////////////// +// Sections constants + +// Section indexes +#define SHN_UNDEF 0 +#define SHN_LORESERVE 0xFF00 +#define SHN_LOPROC 0xFF00 +#define SHN_HIPROC 0xFF1F +#define SHN_LOOS 0xFF20 +#define SHN_HIOS 0xFF3F +#define 
SHN_ABS 0xFFF1 +#define SHN_COMMON 0xFFF2 +#define SHN_XINDEX 0xFFFF +#define SHN_HIRESERVE 0xFFFF + +// Section types +#define SHT_NULL 0 +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_HASH 5 +#define SHT_DYNAMIC 6 +#define SHT_NOTE 7 +#define SHT_NOBITS 8 +#define SHT_REL 9 +#define SHT_SHLIB 10 +#define SHT_DYNSYM 11 +#define SHT_INIT_ARRAY 14 +#define SHT_FINI_ARRAY 15 +#define SHT_PREINIT_ARRAY 16 +#define SHT_GROUP 17 +#define SHT_SYMTAB_SHNDX 18 +#define SHT_LOOS 0x60000000 +#define SHT_HIOS 0x6fffffff +#define SHT_LOPROC 0x70000000 +#define SHT_HIPROC 0x7FFFFFFF +#define SHT_LOUSER 0x80000000 +#define SHT_HIUSER 0xFFFFFFFF + +// Section attribute flags +#define SHF_WRITE 0x1 +#define SHF_ALLOC 0x2 +#define SHF_EXECINSTR 0x4 +#define SHF_MERGE 0x10 +#define SHF_STRINGS 0x20 +#define SHF_INFO_LINK 0x40 +#define SHF_LINK_ORDER 0x80 +#define SHF_OS_NONCONFORMING 0x100 +#define SHF_GROUP 0x200 +#define SHF_TLS 0x400 +#define SHF_MASKOS 0x0ff00000 +#define SHF_MASKPROC 0xF0000000 + +// Section group flags +#define GRP_COMDAT 0x1 +#define GRP_MASKOS 0x0ff00000 +#define GRP_MASKPROC 0xf0000000 + +// Symbol binding +#define STB_LOCAL 0 +#define STB_GLOBAL 1 +#define STB_WEAK 2 +#define STB_LOOS 10 +#define STB_HIOS 12 +#define STB_MULTIDEF 13 +#define STB_LOPROC 13 +#define STB_HIPROC 15 + +// Note types +#define NT_AMDGPU_METADATA 1 +#define NT_AMD_AMDGPU_HSA_METADATA 10 +#define NT_AMD_AMDGPU_ISA 11 +#define NT_AMD_AMDGPU_PAL_METADATA 12 + +// Symbol types +#define STT_NOTYPE 0 +#define STT_OBJECT 1 +#define STT_FUNC 2 +#define STT_SECTION 3 +#define STT_FILE 4 +#define STT_COMMON 5 +#define STT_TLS 6 +#define STT_LOOS 10 +#define STT_AMDGPU_HSA_KERNEL 10 +#define STT_HIOS 12 +#define STT_LOPROC 13 +#define STT_HIPROC 15 + +// Symbol visibility +#define STV_DEFAULT 0 +#define STV_INTERNAL 1 +#define STV_HIDDEN 2 +#define STV_PROTECTED 3 + +// Undefined name +#define STN_UNDEF 0 + +// Relocation types +#define 
R_386_NONE 0 +#define R_X86_64_NONE 0 +#define R_AMDGPU_NONE 0 +#define R_386_32 1 +#define R_X86_64_64 1 +#define R_AMDGPU_ABS32_LO 1 +#define R_386_PC32 2 +#define R_X86_64_PC32 2 +#define R_AMDGPU_ABS32_HI 2 +#define R_386_GOT32 3 +#define R_X86_64_GOT32 3 +#define R_AMDGPU_ABS64 3 +#define R_386_PLT32 4 +#define R_X86_64_PLT32 4 +#define R_AMDGPU_REL32 4 +#define R_386_COPY 5 +#define R_X86_64_COPY 5 +#define R_AMDGPU_REL64 5 +#define R_386_GLOB_DAT 6 +#define R_X86_64_GLOB_DAT 6 +#define R_AMDGPU_ABS32 6 +#define R_386_JMP_SLOT 7 +#define R_X86_64_JUMP_SLOT 7 +#define R_AMDGPU_GOTPCREL 7 +#define R_386_RELATIVE 8 +#define R_X86_64_RELATIVE 8 +#define R_AMDGPU_GOTPCREL32_LO 8 +#define R_386_GOTOFF 9 +#define R_X86_64_GOTPCREL 9 +#define R_AMDGPU_GOTPCREL32_HI 9 +#define R_386_GOTPC 10 +#define R_X86_64_32 10 +#define R_AMDGPU_REL32_LO 10 +#define R_386_32PLT 11 +#define R_X86_64_32S 11 +#define R_AMDGPU_REL32_HI 11 +#define R_X86_64_16 12 +#define R_X86_64_PC16 13 +#define R_AMDGPU_RELATIVE64 13 +#define R_386_TLS_TPOFF 14 +#define R_X86_64_8 14 +#define R_386_TLS_IE 15 +#define R_X86_64_PC8 15 +#define R_386_TLS_GOTIE 16 +#define R_X86_64_DTPMOD64 16 +#define R_386_TLS_LE 17 +#define R_X86_64_DTPOFF64 17 +#define R_386_TLS_GD 18 +#define R_X86_64_TPOFF64 18 +#define R_386_TLS_LDM 19 +#define R_X86_64_TLSGD 19 +#define R_386_16 20 +#define R_X86_64_TLSLD 20 +#define R_386_PC16 21 +#define R_X86_64_DTPOFF32 21 +#define R_386_8 22 +#define R_X86_64_GOTTPOFF 22 +#define R_386_PC8 23 +#define R_X86_64_TPOFF32 23 +#define R_386_TLS_GD_32 24 +#define R_X86_64_PC64 24 +#define R_386_TLS_GD_PUSH 25 +#define R_X86_64_GOTOFF64 25 +#define R_386_TLS_GD_CALL 26 +#define R_X86_64_GOTPC32 26 +#define R_386_TLS_GD_POP 27 +#define R_X86_64_GOT64 27 +#define R_386_TLS_LDM_32 28 +#define R_X86_64_GOTPCREL64 28 +#define R_386_TLS_LDM_PUSH 29 +#define R_X86_64_GOTPC64 29 +#define R_386_TLS_LDM_CALL 30 +#define R_X86_64_GOTPLT64 30 +#define R_386_TLS_LDM_POP 31 +#define 
R_X86_64_PLTOFF64 31 +#define R_386_TLS_LDO_32 32 +#define R_386_TLS_IE_32 33 +#define R_386_TLS_LE_32 34 +#define R_X86_64_GOTPC32_TLSDESC 34 +#define R_386_TLS_DTPMOD32 35 +#define R_X86_64_TLSDESC_CALL 35 +#define R_386_TLS_DTPOFF32 36 +#define R_X86_64_TLSDESC 36 +#define R_386_TLS_TPOFF32 37 +#define R_X86_64_IRELATIVE 37 +#define R_386_SIZE32 38 +#define R_386_TLS_GOTDESC 39 +#define R_386_TLS_DESC_CALL 40 +#define R_386_TLS_DESC 41 +#define R_386_IRELATIVE 42 +#define R_386_GOT32X 43 +#define R_X86_64_GNU_VTINHERIT 250 +#define R_X86_64_GNU_VTENTRY 251 + +// Segment types +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 +#define PT_LOOS 0x60000000 +#define PT_HIOS 0x6fffffff +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7FFFFFFF + +// Segment flags +#define PF_X 1 // Execute +#define PF_W 2 // Write +#define PF_R 4 // Read +#define PF_MASKOS 0x0ff00000 // Unspecified +#define PF_MASKPROC 0xf0000000 // Unspecified + +// Dynamic Array Tags +#define DT_NULL 0 +#define DT_NEEDED 1 +#define DT_PLTRELSZ 2 +#define DT_PLTGOT 3 +#define DT_HASH 4 +#define DT_STRTAB 5 +#define DT_SYMTAB 6 +#define DT_RELA 7 +#define DT_RELASZ 8 +#define DT_RELAENT 9 +#define DT_STRSZ 10 +#define DT_SYMENT 11 +#define DT_INIT 12 +#define DT_FINI 13 +#define DT_SONAME 14 +#define DT_RPATH 15 +#define DT_SYMBOLIC 16 +#define DT_REL 17 +#define DT_RELSZ 18 +#define DT_RELENT 19 +#define DT_PLTREL 20 +#define DT_DEBUG 21 +#define DT_TEXTREL 22 +#define DT_JMPREL 23 +#define DT_BIND_NOW 24 +#define DT_INIT_ARRAY 25 +#define DT_FINI_ARRAY 26 +#define DT_INIT_ARRAYSZ 27 +#define DT_FINI_ARRAYSZ 28 +#define DT_RUNPATH 29 +#define DT_FLAGS 30 +#define DT_ENCODING 32 +#define DT_PREINIT_ARRAY 32 +#define DT_PREINIT_ARRAYSZ 33 +#define DT_MAXPOSTAGS 34 +#define DT_LOOS 0x6000000D +#define DT_HIOS 0x6ffff000 +#define DT_LOPROC 0x70000000 +#define DT_HIPROC 0x7FFFFFFF + +// DT_FLAGS 
values +#define DF_ORIGIN 0x1 +#define DF_SYMBOLIC 0x2 +#define DF_TEXTREL 0x4 +#define DF_BIND_NOW 0x8 +#define DF_STATIC_TLS 0x10 + +// ELF file header +struct Elf32_Ehdr +{ + unsigned char e_ident[EI_NIDENT]; + Elf_Half e_type; + Elf_Half e_machine; + Elf_Word e_version; + Elf32_Addr e_entry; + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf_Word e_flags; + Elf_Half e_ehsize; + Elf_Half e_phentsize; + Elf_Half e_phnum; + Elf_Half e_shentsize; + Elf_Half e_shnum; + Elf_Half e_shstrndx; +}; + +struct Elf64_Ehdr +{ + unsigned char e_ident[EI_NIDENT]; + Elf_Half e_type; + Elf_Half e_machine; + Elf_Word e_version; + Elf64_Addr e_entry; + Elf64_Off e_phoff; + Elf64_Off e_shoff; + Elf_Word e_flags; + Elf_Half e_ehsize; + Elf_Half e_phentsize; + Elf_Half e_phnum; + Elf_Half e_shentsize; + Elf_Half e_shnum; + Elf_Half e_shstrndx; +}; + +// Section header +struct Elf32_Shdr +{ + Elf_Word sh_name; + Elf_Word sh_type; + Elf_Word sh_flags; + Elf32_Addr sh_addr; + Elf32_Off sh_offset; + Elf_Word sh_size; + Elf_Word sh_link; + Elf_Word sh_info; + Elf_Word sh_addralign; + Elf_Word sh_entsize; +}; + +struct Elf64_Shdr +{ + Elf_Word sh_name; + Elf_Word sh_type; + Elf_Xword sh_flags; + Elf64_Addr sh_addr; + Elf64_Off sh_offset; + Elf_Xword sh_size; + Elf_Word sh_link; + Elf_Word sh_info; + Elf_Xword sh_addralign; + Elf_Xword sh_entsize; +}; + +// Segment header +struct Elf32_Phdr +{ + Elf_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf_Word p_filesz; + Elf_Word p_memsz; + Elf_Word p_flags; + Elf_Word p_align; +}; + +struct Elf64_Phdr +{ + Elf_Word p_type; + Elf_Word p_flags; + Elf64_Off p_offset; + Elf64_Addr p_vaddr; + Elf64_Addr p_paddr; + Elf_Xword p_filesz; + Elf_Xword p_memsz; + Elf_Xword p_align; +}; + +// Symbol table entry +struct Elf32_Sym +{ + Elf_Word st_name; + Elf32_Addr st_value; + Elf_Word st_size; + unsigned char st_info; + unsigned char st_other; + Elf_Half st_shndx; +}; + +struct Elf64_Sym +{ + Elf_Word st_name; + unsigned char 
st_info; + unsigned char st_other; + Elf_Half st_shndx; + Elf64_Addr st_value; + Elf_Xword st_size; +}; + +#define ELF_ST_BIND( i ) ( ( i ) >> 4 ) +#define ELF_ST_TYPE( i ) ( (i)&0xf ) +#define ELF_ST_INFO( b, t ) ( ( ( b ) << 4 ) + ( (t)&0xf ) ) + +#define ELF_ST_VISIBILITY( o ) ( (o)&0x3 ) + +// Relocation entries +struct Elf32_Rel +{ + Elf32_Addr r_offset; + Elf_Word r_info; +}; + +struct Elf32_Rela +{ + Elf32_Addr r_offset; + Elf_Word r_info; + Elf_Sword r_addend; +}; + +struct Elf64_Rel +{ + Elf64_Addr r_offset; + Elf_Xword r_info; +}; + +struct Elf64_Rela +{ + Elf64_Addr r_offset; + Elf_Xword r_info; + Elf_Sxword r_addend; +}; + +#define ELF32_R_SYM( i ) ( ( i ) >> 8 ) +#define ELF32_R_TYPE( i ) ( (unsigned char)( i ) ) +#define ELF32_R_INFO( s, t ) ( ( ( s ) << 8 ) + (unsigned char)( t ) ) + +#define ELF64_R_SYM( i ) ( ( i ) >> 32 ) +#define ELF64_R_TYPE( i ) ( (i)&0xffffffffL ) +#define ELF64_R_INFO( s, t ) \ + ( ( ( ( int64_t )( s ) ) << 32 ) + ( (t)&0xffffffffL ) ) + +// Dynamic structure +struct Elf32_Dyn +{ + Elf_Sword d_tag; + union { + Elf_Word d_val; + Elf32_Addr d_ptr; + } d_un; +}; + +struct Elf64_Dyn +{ + Elf_Sxword d_tag; + union { + Elf_Xword d_val; + Elf64_Addr d_ptr; + } d_un; +}; + +} // namespace ELFIO + +#endif // ELFTYPES_H + +/*** End of inlined file: elf_types.hpp ***/ + + +/*** Start of inlined file: elfio_version.hpp ***/ +#define ELFIO_VERSION "3.8" + +/*** End of inlined file: elfio_version.hpp ***/ + + +/*** Start of inlined file: elfio_utils.hpp ***/ +#ifndef ELFIO_UTILS_HPP +#define ELFIO_UTILS_HPP + +#define ELFIO_GET_ACCESS( TYPE, NAME, FIELD ) \ + TYPE get_##NAME() const { return ( *convertor )( FIELD ); } +#define ELFIO_SET_ACCESS( TYPE, NAME, FIELD ) \ + void set_##NAME( TYPE value ) \ + { \ + FIELD = value; \ + FIELD = ( *convertor )( FIELD ); \ + } +#define ELFIO_GET_SET_ACCESS( TYPE, NAME, FIELD ) \ + TYPE get_##NAME() const { return ( *convertor )( FIELD ); } \ + void set_##NAME( TYPE value ) \ + { \ + FIELD = value; \ + 
FIELD = ( *convertor )( FIELD ); \ + } + +#define ELFIO_GET_ACCESS_DECL( TYPE, NAME ) virtual TYPE get_##NAME() const = 0 + +#define ELFIO_SET_ACCESS_DECL( TYPE, NAME ) \ + virtual void set_##NAME( TYPE value ) = 0 + +#define ELFIO_GET_SET_ACCESS_DECL( TYPE, NAME ) \ + virtual TYPE get_##NAME() const = 0; \ + virtual void set_##NAME( TYPE value ) = 0 + +namespace ELFIO { + +//------------------------------------------------------------------------------ +class endianess_convertor +{ + public: + //------------------------------------------------------------------------------ + endianess_convertor() { need_conversion = false; } + + //------------------------------------------------------------------------------ + void setup( unsigned char elf_file_encoding ) + { + need_conversion = ( elf_file_encoding != get_host_encoding() ); + } + + //------------------------------------------------------------------------------ + uint64_t operator()( uint64_t value ) const + { + if ( !need_conversion ) { + return value; + } + value = ( ( value & 0x00000000000000FFull ) << 56 ) | + ( ( value & 0x000000000000FF00ull ) << 40 ) | + ( ( value & 0x0000000000FF0000ull ) << 24 ) | + ( ( value & 0x00000000FF000000ull ) << 8 ) | + ( ( value & 0x000000FF00000000ull ) >> 8 ) | + ( ( value & 0x0000FF0000000000ull ) >> 24 ) | + ( ( value & 0x00FF000000000000ull ) >> 40 ) | + ( ( value & 0xFF00000000000000ull ) >> 56 ); + + return value; + } + + //------------------------------------------------------------------------------ + int64_t operator()( int64_t value ) const + { + if ( !need_conversion ) { + return value; + } + return ( int64_t )( *this )( (uint64_t)value ); + } + + //------------------------------------------------------------------------------ + uint32_t operator()( uint32_t value ) const + { + if ( !need_conversion ) { + return value; + } + value = + ( ( value & 0x000000FF ) << 24 ) | ( ( value & 0x0000FF00 ) << 8 ) | + ( ( value & 0x00FF0000 ) >> 8 ) | ( ( value & 0xFF000000 ) >> 
24 ); + + return value; + } + + //------------------------------------------------------------------------------ + int32_t operator()( int32_t value ) const + { + if ( !need_conversion ) { + return value; + } + return ( int32_t )( *this )( (uint32_t)value ); + } + + //------------------------------------------------------------------------------ + uint16_t operator()( uint16_t value ) const + { + if ( !need_conversion ) { + return value; + } + value = ( ( value & 0x00FF ) << 8 ) | ( ( value & 0xFF00 ) >> 8 ); + + return value; + } + + //------------------------------------------------------------------------------ + int16_t operator()( int16_t value ) const + { + if ( !need_conversion ) { + return value; + } + return ( int16_t )( *this )( (uint16_t)value ); + } + + //------------------------------------------------------------------------------ + int8_t operator()( int8_t value ) const { return value; } + + //------------------------------------------------------------------------------ + uint8_t operator()( uint8_t value ) const { return value; } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + unsigned char get_host_encoding() const + { + static const int tmp = 1; + if ( 1 == *(const char*)&tmp ) { + return ELFDATA2LSB; + } + else { + return ELFDATA2MSB; + } + } + + //------------------------------------------------------------------------------ + private: + bool need_conversion; +}; + +//------------------------------------------------------------------------------ +inline uint32_t elf_hash( const unsigned char* name ) +{ + uint32_t h = 0, g; + while ( *name ) { + h = ( h << 4 ) + *name++; + g = h & 0xf0000000; + if ( g != 0 ) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +} // namespace ELFIO + +#endif // ELFIO_UTILS_HPP + +/*** End of inlined file: elfio_utils.hpp ***/ + + +/*** Start of inlined file: elfio_header.hpp ***/ 
+#ifndef ELF_HEADER_HPP +#define ELF_HEADER_HPP + +#include + +namespace ELFIO { + +class elf_header +{ + public: + virtual ~elf_header(){}; + virtual bool load( std::istream& stream ) = 0; + virtual bool save( std::ostream& stream ) const = 0; + + // ELF header functions + ELFIO_GET_ACCESS_DECL( unsigned char, class ); + ELFIO_GET_ACCESS_DECL( unsigned char, elf_version ); + ELFIO_GET_ACCESS_DECL( unsigned char, encoding ); + ELFIO_GET_ACCESS_DECL( Elf_Half, header_size ); + ELFIO_GET_ACCESS_DECL( Elf_Half, section_entry_size ); + ELFIO_GET_ACCESS_DECL( Elf_Half, segment_entry_size ); + + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, version ); + ELFIO_GET_SET_ACCESS_DECL( unsigned char, os_abi ); + ELFIO_GET_SET_ACCESS_DECL( unsigned char, abi_version ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, type ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, machine ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, flags ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, entry ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, sections_num ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, sections_offset ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, segments_num ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, segments_offset ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, section_name_str_index ); +}; + +template struct elf_header_impl_types; +template <> struct elf_header_impl_types +{ + typedef Elf32_Phdr Phdr_type; + typedef Elf32_Shdr Shdr_type; + static const unsigned char file_class = ELFCLASS32; +}; +template <> struct elf_header_impl_types +{ + typedef Elf64_Phdr Phdr_type; + typedef Elf64_Shdr Shdr_type; + static const unsigned char file_class = ELFCLASS64; +}; + +template class elf_header_impl : public elf_header +{ + public: + //------------------------------------------------------------------------------ + elf_header_impl( endianess_convertor* convertor_, unsigned char encoding ) + { + convertor = convertor_; + + std::fill_n( reinterpret_cast( &header ), sizeof( header ), + '\0' ); + + header.e_ident[EI_MAG0] = ELFMAG0; + 
header.e_ident[EI_MAG1] = ELFMAG1; + header.e_ident[EI_MAG2] = ELFMAG2; + header.e_ident[EI_MAG3] = ELFMAG3; + header.e_ident[EI_CLASS] = elf_header_impl_types::file_class; + header.e_ident[EI_DATA] = encoding; + header.e_ident[EI_VERSION] = EV_CURRENT; + header.e_version = ( *convertor )( (Elf_Word)EV_CURRENT ); + header.e_ehsize = ( sizeof( header ) ); + header.e_ehsize = ( *convertor )( header.e_ehsize ); + header.e_shstrndx = ( *convertor )( (Elf_Half)1 ); + header.e_phentsize = + sizeof( typename elf_header_impl_types::Phdr_type ); + header.e_shentsize = + sizeof( typename elf_header_impl_types::Shdr_type ); + header.e_phentsize = ( *convertor )( header.e_phentsize ); + header.e_shentsize = ( *convertor )( header.e_shentsize ); + } + + //------------------------------------------------------------------------------ + bool load( std::istream& stream ) + { + stream.seekg( 0 ); + stream.read( reinterpret_cast( &header ), sizeof( header ) ); + + return ( stream.gcount() == sizeof( header ) ); + } + + //------------------------------------------------------------------------------ + bool save( std::ostream& stream ) const + { + stream.seekp( 0 ); + stream.write( reinterpret_cast( &header ), + sizeof( header ) ); + + return stream.good(); + } + + //------------------------------------------------------------------------------ + // ELF header functions + ELFIO_GET_ACCESS( unsigned char, class, header.e_ident[EI_CLASS] ); + ELFIO_GET_ACCESS( unsigned char, elf_version, header.e_ident[EI_VERSION] ); + ELFIO_GET_ACCESS( unsigned char, encoding, header.e_ident[EI_DATA] ); + ELFIO_GET_ACCESS( Elf_Half, header_size, header.e_ehsize ); + ELFIO_GET_ACCESS( Elf_Half, section_entry_size, header.e_shentsize ); + ELFIO_GET_ACCESS( Elf_Half, segment_entry_size, header.e_phentsize ); + + ELFIO_GET_SET_ACCESS( Elf_Word, version, header.e_version ); + ELFIO_GET_SET_ACCESS( unsigned char, os_abi, header.e_ident[EI_OSABI] ); + ELFIO_GET_SET_ACCESS( unsigned char, + abi_version, + 
header.e_ident[EI_ABIVERSION] ); + ELFIO_GET_SET_ACCESS( Elf_Half, type, header.e_type ); + ELFIO_GET_SET_ACCESS( Elf_Half, machine, header.e_machine ); + ELFIO_GET_SET_ACCESS( Elf_Word, flags, header.e_flags ); + ELFIO_GET_SET_ACCESS( Elf_Half, section_name_str_index, header.e_shstrndx ); + ELFIO_GET_SET_ACCESS( Elf64_Addr, entry, header.e_entry ); + ELFIO_GET_SET_ACCESS( Elf_Half, sections_num, header.e_shnum ); + ELFIO_GET_SET_ACCESS( Elf64_Off, sections_offset, header.e_shoff ); + ELFIO_GET_SET_ACCESS( Elf_Half, segments_num, header.e_phnum ); + ELFIO_GET_SET_ACCESS( Elf64_Off, segments_offset, header.e_phoff ); + + private: + T header; + endianess_convertor* convertor; +}; + +} // namespace ELFIO + +#endif // ELF_HEADER_HPP + +/*** End of inlined file: elfio_header.hpp ***/ + + +/*** Start of inlined file: elfio_section.hpp ***/ +#ifndef ELFIO_SECTION_HPP +#define ELFIO_SECTION_HPP + +#include +#include +#include + +namespace ELFIO { + +class section +{ + friend class elfio; + + public: + virtual ~section(){}; + + ELFIO_GET_ACCESS_DECL( Elf_Half, index ); + ELFIO_GET_SET_ACCESS_DECL( std::string, name ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, type ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, flags ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, info ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, link ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, addr_align ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, entry_size ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, address ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, size ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, name_string_offset ); + ELFIO_GET_ACCESS_DECL( Elf64_Off, offset ); + + virtual const char* get_data() const = 0; + virtual void set_data( const char* pData, Elf_Word size ) = 0; + virtual void set_data( const std::string& data ) = 0; + virtual void append_data( const char* pData, Elf_Word size ) = 0; + virtual void append_data( const std::string& data ) = 0; + virtual size_t get_stream_size() const = 0; + virtual void 
set_stream_size( size_t value ) = 0; + + protected: + ELFIO_SET_ACCESS_DECL( Elf64_Off, offset ); + ELFIO_SET_ACCESS_DECL( Elf_Half, index ); + + virtual void load( std::istream& stream, std::streampos header_offset ) = 0; + virtual void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) = 0; + virtual bool is_address_initialized() const = 0; +}; + +template class section_impl : public section +{ + public: + //------------------------------------------------------------------------------ + section_impl( const endianess_convertor* convertor_ ) + : convertor( convertor_ ) + { + std::fill_n( reinterpret_cast( &header ), sizeof( header ), + '\0' ); + is_address_set = false; + data = 0; + data_size = 0; + index = 0; + stream_size = 0; + } + + //------------------------------------------------------------------------------ + ~section_impl() { delete[] data; } + + //------------------------------------------------------------------------------ + // Section info functions + ELFIO_GET_SET_ACCESS( Elf_Word, type, header.sh_type ); + ELFIO_GET_SET_ACCESS( Elf_Xword, flags, header.sh_flags ); + ELFIO_GET_SET_ACCESS( Elf_Xword, size, header.sh_size ); + ELFIO_GET_SET_ACCESS( Elf_Word, link, header.sh_link ); + ELFIO_GET_SET_ACCESS( Elf_Word, info, header.sh_info ); + ELFIO_GET_SET_ACCESS( Elf_Xword, addr_align, header.sh_addralign ); + ELFIO_GET_SET_ACCESS( Elf_Xword, entry_size, header.sh_entsize ); + ELFIO_GET_SET_ACCESS( Elf_Word, name_string_offset, header.sh_name ); + ELFIO_GET_ACCESS( Elf64_Addr, address, header.sh_addr ); + + //------------------------------------------------------------------------------ + Elf_Half get_index() const { return index; } + + //------------------------------------------------------------------------------ + std::string get_name() const { return name; } + + //------------------------------------------------------------------------------ + void set_name( std::string name_ ) { name = name_; } + + 
//------------------------------------------------------------------------------ + void set_address( Elf64_Addr value ) + { + header.sh_addr = value; + header.sh_addr = ( *convertor )( header.sh_addr ); + is_address_set = true; + } + + //------------------------------------------------------------------------------ + bool is_address_initialized() const { return is_address_set; } + + //------------------------------------------------------------------------------ + const char* get_data() const { return data; } + + //------------------------------------------------------------------------------ + void set_data( const char* raw_data, Elf_Word size ) + { + if ( get_type() != SHT_NOBITS ) { + delete[] data; + data = new ( std::nothrow ) char[size]; + if ( 0 != data && 0 != raw_data ) { + data_size = size; + std::copy( raw_data, raw_data + size, data ); + } + else { + data_size = 0; + } + } + + set_size( data_size ); + } + + //------------------------------------------------------------------------------ + void set_data( const std::string& str_data ) + { + return set_data( str_data.c_str(), (Elf_Word)str_data.size() ); + } + + //------------------------------------------------------------------------------ + void append_data( const char* raw_data, Elf_Word size ) + { + if ( get_type() != SHT_NOBITS ) { + if ( get_size() + size < data_size ) { + std::copy( raw_data, raw_data + size, data + get_size() ); + } + else { + data_size = 2 * ( data_size + size ); + char* new_data = new ( std::nothrow ) char[data_size]; + + if ( 0 != new_data ) { + std::copy( data, data + get_size(), new_data ); + std::copy( raw_data, raw_data + size, + new_data + get_size() ); + delete[] data; + data = new_data; + } + else { + size = 0; + } + } + set_size( get_size() + size ); + } + } + + //------------------------------------------------------------------------------ + void append_data( const std::string& str_data ) + { + return append_data( str_data.c_str(), (Elf_Word)str_data.size() ); + } + + 
//------------------------------------------------------------------------------ + protected: + //------------------------------------------------------------------------------ + ELFIO_GET_SET_ACCESS( Elf64_Off, offset, header.sh_offset ); + + //------------------------------------------------------------------------------ + void set_index( Elf_Half value ) { index = value; } + + //------------------------------------------------------------------------------ + void load( std::istream& stream, std::streampos header_offset ) + { + std::fill_n( reinterpret_cast( &header ), sizeof( header ), + '\0' ); + + stream.seekg( 0, stream.end ); + set_stream_size( stream.tellg() ); + + stream.seekg( header_offset ); + stream.read( reinterpret_cast( &header ), sizeof( header ) ); + + Elf_Xword size = get_size(); + if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() && + size < get_stream_size() ) { + data = new ( std::nothrow ) char[size + 1]; + + if ( ( 0 != size ) && ( 0 != data ) ) { + stream.seekg( ( *convertor )( header.sh_offset ) ); + stream.read( data, size ); + data[size] = 0; // Ensure data is ended with 0 to avoid oob read + data_size = size; + } + else { + data_size = 0; + } + } + } + + //------------------------------------------------------------------------------ + void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) + { + if ( 0 != get_index() ) { + header.sh_offset = data_offset; + header.sh_offset = ( *convertor )( header.sh_offset ); + } + + save_header( stream, header_offset ); + if ( get_type() != SHT_NOBITS && get_type() != SHT_NULL && + get_size() != 0 && data != 0 ) { + save_data( stream, data_offset ); + } + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + void save_header( std::ostream& stream, std::streampos header_offset ) const + { + stream.seekp( 
header_offset ); + stream.write( reinterpret_cast( &header ), + sizeof( header ) ); + } + + //------------------------------------------------------------------------------ + void save_data( std::ostream& stream, std::streampos data_offset ) const + { + stream.seekp( data_offset ); + stream.write( get_data(), get_size() ); + } + + //------------------------------------------------------------------------------ + size_t get_stream_size() const { return stream_size; } + + //------------------------------------------------------------------------------ + void set_stream_size( size_t value ) { stream_size = value; } + + //------------------------------------------------------------------------------ + private: + T header; + Elf_Half index; + std::string name; + char* data; + Elf_Word data_size; + const endianess_convertor* convertor; + bool is_address_set; + size_t stream_size; +}; + +} // namespace ELFIO + +#endif // ELFIO_SECTION_HPP + +/*** End of inlined file: elfio_section.hpp ***/ + + +/*** Start of inlined file: elfio_segment.hpp ***/ +#ifndef ELFIO_SEGMENT_HPP +#define ELFIO_SEGMENT_HPP + +#include +#include +#include + +namespace ELFIO { + +class segment +{ + friend class elfio; + + public: + virtual ~segment(){}; + + ELFIO_GET_ACCESS_DECL( Elf_Half, index ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, type ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, flags ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, align ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, virtual_address ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, physical_address ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, file_size ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, memory_size ); + ELFIO_GET_ACCESS_DECL( Elf64_Off, offset ); + + virtual const char* get_data() const = 0; + + virtual Elf_Half add_section_index( Elf_Half index, + Elf_Xword addr_align ) = 0; + virtual Elf_Half get_sections_num() const = 0; + virtual Elf_Half get_section_index_at( Elf_Half num ) const = 0; + virtual bool is_offset_initialized() const = 
0; + + protected: + ELFIO_SET_ACCESS_DECL( Elf64_Off, offset ); + ELFIO_SET_ACCESS_DECL( Elf_Half, index ); + + virtual const std::vector& get_sections() const = 0; + virtual void load( std::istream& stream, std::streampos header_offset ) = 0; + virtual void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) = 0; +}; + +//------------------------------------------------------------------------------ +template class segment_impl : public segment +{ + public: + //------------------------------------------------------------------------------ + segment_impl( endianess_convertor* convertor_ ) + : stream_size( 0 ), index( 0 ), data( 0 ), convertor( convertor_ ) + { + is_offset_set = false; + std::fill_n( reinterpret_cast( &ph ), sizeof( ph ), '\0' ); + } + + //------------------------------------------------------------------------------ + virtual ~segment_impl() { delete[] data; } + + //------------------------------------------------------------------------------ + // Section info functions + ELFIO_GET_SET_ACCESS( Elf_Word, type, ph.p_type ); + ELFIO_GET_SET_ACCESS( Elf_Word, flags, ph.p_flags ); + ELFIO_GET_SET_ACCESS( Elf_Xword, align, ph.p_align ); + ELFIO_GET_SET_ACCESS( Elf64_Addr, virtual_address, ph.p_vaddr ); + ELFIO_GET_SET_ACCESS( Elf64_Addr, physical_address, ph.p_paddr ); + ELFIO_GET_SET_ACCESS( Elf_Xword, file_size, ph.p_filesz ); + ELFIO_GET_SET_ACCESS( Elf_Xword, memory_size, ph.p_memsz ); + ELFIO_GET_ACCESS( Elf64_Off, offset, ph.p_offset ); + size_t stream_size; + + //------------------------------------------------------------------------------ + size_t get_stream_size() const { return stream_size; } + + //------------------------------------------------------------------------------ + void set_stream_size( size_t value ) { stream_size = value; } + + //------------------------------------------------------------------------------ + Elf_Half get_index() const { return index; } + + 
//------------------------------------------------------------------------------ + const char* get_data() const { return data; } + + //------------------------------------------------------------------------------ + Elf_Half add_section_index( Elf_Half sec_index, Elf_Xword addr_align ) + { + sections.push_back( sec_index ); + if ( addr_align > get_align() ) { + set_align( addr_align ); + } + + return (Elf_Half)sections.size(); + } + + //------------------------------------------------------------------------------ + Elf_Half get_sections_num() const { return (Elf_Half)sections.size(); } + + //------------------------------------------------------------------------------ + Elf_Half get_section_index_at( Elf_Half num ) const + { + if ( num < sections.size() ) { + return sections[num]; + } + + return Elf_Half( -1 ); + } + + //------------------------------------------------------------------------------ + protected: + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + void set_offset( Elf64_Off value ) + { + ph.p_offset = value; + ph.p_offset = ( *convertor )( ph.p_offset ); + is_offset_set = true; + } + + //------------------------------------------------------------------------------ + bool is_offset_initialized() const { return is_offset_set; } + + //------------------------------------------------------------------------------ + const std::vector& get_sections() const { return sections; } + + //------------------------------------------------------------------------------ + void set_index( Elf_Half value ) { index = value; } + + //------------------------------------------------------------------------------ + void load( std::istream& stream, std::streampos header_offset ) + { + + stream.seekg( 0, stream.end ); + set_stream_size( stream.tellg() ); + + stream.seekg( header_offset ); + stream.read( reinterpret_cast( &ph ), sizeof( ph ) ); + 
is_offset_set = true; + + if ( PT_NULL != get_type() && 0 != get_file_size() ) { + stream.seekg( ( *convertor )( ph.p_offset ) ); + Elf_Xword size = get_file_size(); + + if ( size > get_stream_size() ) { + data = 0; + } + else { + data = new (std::nothrow) char[size + 1]; + + if ( 0 != data ) { + stream.read( data, size ); + data[size] = 0; + } + } + } + } + + //------------------------------------------------------------------------------ + void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) + { + ph.p_offset = data_offset; + ph.p_offset = ( *convertor )( ph.p_offset ); + stream.seekp( header_offset ); + stream.write( reinterpret_cast( &ph ), sizeof( ph ) ); + } + + //------------------------------------------------------------------------------ + private: + T ph; + Elf_Half index; + char* data; + std::vector sections; + endianess_convertor* convertor; + bool is_offset_set; +}; + +} // namespace ELFIO + +#endif // ELFIO_SEGMENT_HPP + +/*** End of inlined file: elfio_segment.hpp ***/ + + +/*** Start of inlined file: elfio_strings.hpp ***/ +#ifndef ELFIO_STRINGS_HPP +#define ELFIO_STRINGS_HPP + +#include +#include +#include + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class string_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + string_section_accessor_template( S* section_ ) : string_section( section_ ) + { + } + + //------------------------------------------------------------------------------ + const char* get_string( Elf_Word index ) const + { + if ( string_section ) { + if ( index < string_section->get_size() ) { + const char* data = string_section->get_data(); + if ( 0 != data ) { + return data + index; + } + } + } + + return 0; + } + + //------------------------------------------------------------------------------ + Elf_Word add_string( const char* str ) + { + Elf_Word 
current_position = 0; + + if ( string_section ) { + // Strings are addeded to the end of the current section data + current_position = (Elf_Word)string_section->get_size(); + + if ( current_position == 0 ) { + char empty_string = '\0'; + string_section->append_data( &empty_string, 1 ); + current_position++; + } + string_section->append_data( str, + (Elf_Word)std::strlen( str ) + 1 ); + } + + return current_position; + } + + //------------------------------------------------------------------------------ + Elf_Word add_string( const std::string& str ) + { + return add_string( str.c_str() ); + } + + //------------------------------------------------------------------------------ + private: + S* string_section; +}; + +using string_section_accessor = string_section_accessor_template
; +using const_string_section_accessor = + string_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_STRINGS_HPP + +/*** End of inlined file: elfio_strings.hpp ***/ + +#define ELFIO_HEADER_ACCESS_GET( TYPE, FNAME ) \ + TYPE get_##FNAME() const { return header ? ( header->get_##FNAME() ) : 0; } + +#define ELFIO_HEADER_ACCESS_GET_SET( TYPE, FNAME ) \ + TYPE get_##FNAME() const \ + { \ + return header ? ( header->get_##FNAME() ) : 0; \ + } \ + void set_##FNAME( TYPE val ) \ + { \ + if ( header ) { \ + header->set_##FNAME( val ); \ + } \ + } + +namespace ELFIO { + +//------------------------------------------------------------------------------ +class elfio +{ + public: + //------------------------------------------------------------------------------ + elfio() : sections( this ), segments( this ) + { + header = 0; + current_file_pos = 0; + create( ELFCLASS32, ELFDATA2LSB ); + } + + //------------------------------------------------------------------------------ + ~elfio() { clean(); } + + //------------------------------------------------------------------------------ + void create( unsigned char file_class, unsigned char encoding ) + { + clean(); + convertor.setup( encoding ); + header = create_header( file_class, encoding ); + create_mandatory_sections(); + } + + //------------------------------------------------------------------------------ + bool load( const std::string& file_name ) + { + std::ifstream stream; + stream.open( file_name.c_str(), std::ios::in | std::ios::binary ); + if ( !stream ) { + return false; + } + + return load( stream ); + } + + //------------------------------------------------------------------------------ + bool load( std::istream& stream ) + { + clean(); + + unsigned char e_ident[EI_NIDENT]; + // Read ELF file signature + stream.read( reinterpret_cast( &e_ident ), sizeof( e_ident ) ); + + // Is it ELF file? 
+ if ( stream.gcount() != sizeof( e_ident ) || + e_ident[EI_MAG0] != ELFMAG0 || e_ident[EI_MAG1] != ELFMAG1 || + e_ident[EI_MAG2] != ELFMAG2 || e_ident[EI_MAG3] != ELFMAG3 ) { + return false; + } + + if ( ( e_ident[EI_CLASS] != ELFCLASS64 ) && + ( e_ident[EI_CLASS] != ELFCLASS32 ) ) { + return false; + } + + convertor.setup( e_ident[EI_DATA] ); + header = create_header( e_ident[EI_CLASS], e_ident[EI_DATA] ); + if ( 0 == header ) { + return false; + } + if ( !header->load( stream ) ) { + return false; + } + + load_sections( stream ); + bool is_still_good = load_segments( stream ); + return is_still_good; + } + + //------------------------------------------------------------------------------ + bool save( const std::string& file_name ) + { + std::ofstream stream; + stream.open( file_name.c_str(), std::ios::out | std::ios::binary ); + if ( !stream ) { + return false; + } + + return save( stream ); + } + + //------------------------------------------------------------------------------ + bool save( std::ostream& stream ) + { + if ( !stream || !header ) { + return false; + } + + bool is_still_good = true; + // Define layout specific header fields + // The position of the segment table is fixed after the header. + // The position of the section table is variable and needs to be fixed + // before saving. + header->set_segments_num( segments.size() ); + header->set_segments_offset( segments.size() ? 
header->get_header_size() + : 0 ); + header->set_sections_num( sections.size() ); + header->set_sections_offset( 0 ); + + // Layout the first section right after the segment table + current_file_pos = header->get_header_size() + + header->get_segment_entry_size() * + (Elf_Xword)header->get_segments_num(); + + calc_segment_alignment(); + + is_still_good = layout_segments_and_their_sections(); + is_still_good = is_still_good && layout_sections_without_segments(); + is_still_good = is_still_good && layout_section_table(); + + is_still_good = is_still_good && save_header( stream ); + is_still_good = is_still_good && save_sections( stream ); + is_still_good = is_still_good && save_segments( stream ); + + return is_still_good; + } + + //------------------------------------------------------------------------------ + // ELF header access functions + ELFIO_HEADER_ACCESS_GET( unsigned char, class ); + ELFIO_HEADER_ACCESS_GET( unsigned char, elf_version ); + ELFIO_HEADER_ACCESS_GET( unsigned char, encoding ); + ELFIO_HEADER_ACCESS_GET( Elf_Word, version ); + ELFIO_HEADER_ACCESS_GET( Elf_Half, header_size ); + ELFIO_HEADER_ACCESS_GET( Elf_Half, section_entry_size ); + ELFIO_HEADER_ACCESS_GET( Elf_Half, segment_entry_size ); + + ELFIO_HEADER_ACCESS_GET_SET( unsigned char, os_abi ); + ELFIO_HEADER_ACCESS_GET_SET( unsigned char, abi_version ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Half, type ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Half, machine ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Word, flags ); + ELFIO_HEADER_ACCESS_GET_SET( Elf64_Addr, entry ); + ELFIO_HEADER_ACCESS_GET_SET( Elf64_Off, sections_offset ); + ELFIO_HEADER_ACCESS_GET_SET( Elf64_Off, segments_offset ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Half, section_name_str_index ); + + //------------------------------------------------------------------------------ + const endianess_convertor& get_convertor() const { return convertor; } + + //------------------------------------------------------------------------------ + Elf_Xword 
get_default_entry_size( Elf_Word section_type ) const + { + switch ( section_type ) { + case SHT_RELA: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Rela ); + } + else { + return sizeof( Elf32_Rela ); + } + case SHT_REL: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Rel ); + } + else { + return sizeof( Elf32_Rel ); + } + case SHT_SYMTAB: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Sym ); + } + else { + return sizeof( Elf32_Sym ); + } + case SHT_DYNAMIC: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Dyn ); + } + else { + return sizeof( Elf32_Dyn ); + } + default: + return 0; + } + } + + //------------------------------------------------------------------------------ + private: + bool is_offset_in_section( Elf64_Off offset, const section* sec ) const + { + return ( offset >= sec->get_offset() ) && + ( offset < ( sec->get_offset() + sec->get_size() ) ); + } + + //------------------------------------------------------------------------------ + public: + //! returns an empty string if no problems are detected, + //! 
or a string containing an error message if problems are found + std::string validate() const + { + + // check for overlapping sections in the file + for ( int i = 0; i < sections.size(); ++i ) { + for ( int j = i + 1; j < sections.size(); ++j ) { + const section* a = sections[i]; + const section* b = sections[j]; + if ( !( a->get_type() & SHT_NOBITS ) && + !( b->get_type() & SHT_NOBITS ) && ( a->get_size() > 0 ) && + ( b->get_size() > 0 ) && ( a->get_offset() > 0 ) && + ( b->get_offset() > 0 ) ) { + if ( is_offset_in_section( a->get_offset(), b ) || + is_offset_in_section( + a->get_offset() + a->get_size() - 1, b ) || + is_offset_in_section( b->get_offset(), a ) || + is_offset_in_section( + b->get_offset() + b->get_size() - 1, a ) ) { + return "Sections " + a->get_name() + " and " + + b->get_name() + " overlap in file"; + } + } + } + } + + // more checks to be added here... + + return ""; + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + void clean() + { + delete header; + header = 0; + + std::vector::const_iterator it; + for ( it = sections_.begin(); it != sections_.end(); ++it ) { + delete *it; + } + sections_.clear(); + + std::vector::const_iterator it1; + for ( it1 = segments_.begin(); it1 != segments_.end(); ++it1 ) { + delete *it1; + } + segments_.clear(); + } + + //------------------------------------------------------------------------------ + elf_header* create_header( unsigned char file_class, + unsigned char encoding ) + { + elf_header* new_header = 0; + + if ( file_class == ELFCLASS64 ) { + new_header = + new elf_header_impl( &convertor, encoding ); + } + else if ( file_class == ELFCLASS32 ) { + new_header = + new elf_header_impl( &convertor, encoding ); + } + else { + return 0; + } + + return new_header; + } + + //------------------------------------------------------------------------------ + section* create_section() 
+ { + section* new_section; + unsigned char file_class = get_class(); + + if ( file_class == ELFCLASS64 ) { + new_section = new section_impl( &convertor ); + } + else if ( file_class == ELFCLASS32 ) { + new_section = new section_impl( &convertor ); + } + else { + return 0; + } + + new_section->set_index( (Elf_Half)sections_.size() ); + sections_.push_back( new_section ); + + return new_section; + } + + //------------------------------------------------------------------------------ + segment* create_segment() + { + segment* new_segment; + unsigned char file_class = header->get_class(); + + if ( file_class == ELFCLASS64 ) { + new_segment = new segment_impl( &convertor ); + } + else if ( file_class == ELFCLASS32 ) { + new_segment = new segment_impl( &convertor ); + } + else { + return 0; + } + + new_segment->set_index( (Elf_Half)segments_.size() ); + segments_.push_back( new_segment ); + + return new_segment; + } + + //------------------------------------------------------------------------------ + void create_mandatory_sections() + { + // Create null section without calling to 'add_section' as no string + // section containing section names exists yet + section* sec0 = create_section(); + sec0->set_index( 0 ); + sec0->set_name( "" ); + sec0->set_name_string_offset( 0 ); + + set_section_name_str_index( 1 ); + section* shstrtab = sections.add( ".shstrtab" ); + shstrtab->set_type( SHT_STRTAB ); + shstrtab->set_addr_align( 1 ); + } + + //------------------------------------------------------------------------------ + Elf_Half load_sections( std::istream& stream ) + { + Elf_Half entry_size = header->get_section_entry_size(); + Elf_Half num = header->get_sections_num(); + Elf64_Off offset = header->get_sections_offset(); + + for ( Elf_Half i = 0; i < num; ++i ) { + section* sec = create_section(); + sec->load( stream, (std::streamoff)offset + + (std::streampos)i * entry_size ); + sec->set_index( i ); + // To mark that the section is not permitted to reassign address + // 
during layout calculation + sec->set_address( sec->get_address() ); + } + + Elf_Half shstrndx = get_section_name_str_index(); + + if ( SHN_UNDEF != shstrndx ) { + string_section_accessor str_reader( sections[shstrndx] ); + for ( Elf_Half i = 0; i < num; ++i ) { + Elf_Word section_offset = sections[i]->get_name_string_offset(); + const char* p = str_reader.get_string( section_offset ); + if ( p != 0 ) { + sections[i]->set_name( p ); + } + } + } + + return num; + } + + //------------------------------------------------------------------------------ + //! Checks whether the addresses of the section entirely fall within the given segment. + //! It doesn't matter if the addresses are memory addresses, or file offsets, + //! they just need to be in the same address space + bool is_sect_in_seg( Elf64_Off sect_begin, + Elf_Xword sect_size, + Elf64_Off seg_begin, + Elf64_Off seg_end ) + { + return ( seg_begin <= sect_begin ) && + ( sect_begin + sect_size <= seg_end ) && + ( sect_begin < + seg_end ); // this is important criteria when sect_size == 0 + // Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11) + // sect_begin=12, sect_size=0 -> shall return false! 
+ } + + //------------------------------------------------------------------------------ + bool load_segments( std::istream& stream ) + { + Elf_Half entry_size = header->get_segment_entry_size(); + Elf_Half num = header->get_segments_num(); + Elf64_Off offset = header->get_segments_offset(); + + for ( Elf_Half i = 0; i < num; ++i ) { + segment* seg; + unsigned char file_class = header->get_class(); + + if ( file_class == ELFCLASS64 ) { + seg = new segment_impl( &convertor ); + } + else if ( file_class == ELFCLASS32 ) { + seg = new segment_impl( &convertor ); + } + else { + return false; + } + + seg->load( stream, (std::streamoff)offset + + (std::streampos)i * entry_size ); + seg->set_index( i ); + + // Add sections to the segments (similar to readelfs algorithm) + Elf64_Off segBaseOffset = seg->get_offset(); + Elf64_Off segEndOffset = segBaseOffset + seg->get_file_size(); + Elf64_Off segVBaseAddr = seg->get_virtual_address(); + Elf64_Off segVEndAddr = segVBaseAddr + seg->get_memory_size(); + for ( Elf_Half j = 0; j < sections.size(); ++j ) { + const section* psec = sections[j]; + + // SHF_ALLOC sections are matched based on the virtual address + // otherwise the file offset is matched + if ( ( psec->get_flags() & SHF_ALLOC ) + ? is_sect_in_seg( psec->get_address(), + psec->get_size(), segVBaseAddr, + segVEndAddr ) + : is_sect_in_seg( psec->get_offset(), psec->get_size(), + segBaseOffset, segEndOffset ) ) { + // Alignment of segment shall not be updated, to preserve original value + // It will be re-calculated on saving. 
+ seg->add_section_index( psec->get_index(), 0 ); + } + } + + // Add section into the segments' container + segments_.push_back( seg ); + } + + return true; + } + + //------------------------------------------------------------------------------ + bool save_header( std::ostream& stream ) { return header->save( stream ); } + + //------------------------------------------------------------------------------ + bool save_sections( std::ostream& stream ) + { + for ( unsigned int i = 0; i < sections_.size(); ++i ) { + section* sec = sections_.at( i ); + + std::streampos headerPosition = + (std::streamoff)header->get_sections_offset() + + (std::streampos)header->get_section_entry_size() * + sec->get_index(); + + sec->save( stream, headerPosition, sec->get_offset() ); + } + return true; + } + + //------------------------------------------------------------------------------ + bool save_segments( std::ostream& stream ) + { + for ( unsigned int i = 0; i < segments_.size(); ++i ) { + segment* seg = segments_.at( i ); + + std::streampos headerPosition = + header->get_segments_offset() + + (std::streampos)header->get_segment_entry_size() * + seg->get_index(); + + seg->save( stream, headerPosition, seg->get_offset() ); + } + return true; + } + + //------------------------------------------------------------------------------ + bool is_section_without_segment( unsigned int section_index ) + { + bool found = false; + + for ( unsigned int j = 0; !found && ( j < segments.size() ); ++j ) { + for ( unsigned int k = 0; + !found && ( k < segments[j]->get_sections_num() ); ++k ) { + found = segments[j]->get_section_index_at( k ) == section_index; + } + } + + return !found; + } + + //------------------------------------------------------------------------------ + bool is_subsequence_of( segment* seg1, segment* seg2 ) + { + // Return 'true' if sections of seg1 are a subset of sections in seg2 + const std::vector& sections1 = seg1->get_sections(); + const std::vector& sections2 = 
seg2->get_sections(); + + bool found = false; + if ( sections1.size() < sections2.size() ) { + found = std::includes( sections2.begin(), sections2.end(), + sections1.begin(), sections1.end() ); + } + + return found; + } + + //------------------------------------------------------------------------------ + std::vector get_ordered_segments() + { + std::vector res; + std::deque worklist; + + res.reserve( segments.size() ); + std::copy( segments_.begin(), segments_.end(), + std::back_inserter( worklist ) ); + + // Bring the segments which start at address 0 to the front + size_t nextSlot = 0; + for ( size_t i = 0; i < worklist.size(); ++i ) { + if ( i != nextSlot && worklist[i]->is_offset_initialized() && + worklist[i]->get_offset() == 0 ) { + if ( worklist[nextSlot]->get_offset() == 0 ) { + ++nextSlot; + } + std::swap( worklist[i], worklist[nextSlot] ); + ++nextSlot; + } + } + + while ( !worklist.empty() ) { + segment* seg = worklist.front(); + worklist.pop_front(); + + size_t i = 0; + for ( ; i < worklist.size(); ++i ) { + if ( is_subsequence_of( seg, worklist[i] ) ) { + break; + } + } + + if ( i < worklist.size() ) + worklist.push_back( seg ); + else + res.push_back( seg ); + } + + return res; + } + + //------------------------------------------------------------------------------ + bool layout_sections_without_segments() + { + for ( unsigned int i = 0; i < sections_.size(); ++i ) { + if ( is_section_without_segment( i ) ) { + section* sec = sections_[i]; + + Elf_Xword section_align = sec->get_addr_align(); + if ( section_align > 1 && + current_file_pos % section_align != 0 ) { + current_file_pos += + section_align - current_file_pos % section_align; + } + + if ( 0 != sec->get_index() ) + sec->set_offset( current_file_pos ); + + if ( SHT_NOBITS != sec->get_type() && + SHT_NULL != sec->get_type() ) { + current_file_pos += sec->get_size(); + } + } + } + + return true; + } + + //------------------------------------------------------------------------------ + void 
calc_segment_alignment() + { + for ( std::vector::iterator s = segments_.begin(); + s != segments_.end(); ++s ) { + segment* seg = *s; + for ( int i = 0; i < seg->get_sections_num(); ++i ) { + section* sect = sections_[seg->get_section_index_at( i )]; + if ( sect->get_addr_align() > seg->get_align() ) { + seg->set_align( sect->get_addr_align() ); + } + } + } + } + + //------------------------------------------------------------------------------ + bool layout_segments_and_their_sections() + { + std::vector worklist; + std::vector section_generated( sections.size(), false ); + + // Get segments in a order in where segments which contain a + // sub sequence of other segments are located at the end + worklist = get_ordered_segments(); + + for ( unsigned int i = 0; i < worklist.size(); ++i ) { + Elf_Xword segment_memory = 0; + Elf_Xword segment_filesize = 0; + Elf_Xword seg_start_pos = current_file_pos; + segment* seg = worklist[i]; + + // Special case: PHDR segment + // This segment contains the program headers but no sections + if ( seg->get_type() == PT_PHDR && seg->get_sections_num() == 0 ) { + seg_start_pos = header->get_segments_offset(); + segment_memory = segment_filesize = + header->get_segment_entry_size() * + (Elf_Xword)header->get_segments_num(); + } + // Special case: + else if ( seg->is_offset_initialized() && seg->get_offset() == 0 ) { + seg_start_pos = 0; + if ( seg->get_sections_num() ) { + segment_memory = segment_filesize = current_file_pos; + } + } + // New segments with not generated sections + // have to be aligned + else if ( seg->get_sections_num() && + !section_generated[seg->get_section_index_at( 0 )] ) { + Elf_Xword align = seg->get_align() > 0 ? 
seg->get_align() : 1; + Elf64_Off cur_page_alignment = current_file_pos % align; + Elf64_Off req_page_alignment = + seg->get_virtual_address() % align; + Elf64_Off error = req_page_alignment - cur_page_alignment; + + current_file_pos += ( seg->get_align() + error ) % align; + seg_start_pos = current_file_pos; + } + else if ( seg->get_sections_num() ) { + seg_start_pos = + sections[seg->get_section_index_at( 0 )]->get_offset(); + } + + // Write segment's data + for ( unsigned int j = 0; j < seg->get_sections_num(); ++j ) { + Elf_Half index = seg->get_section_index_at( j ); + + section* sec = sections[index]; + + // The NULL section is always generated + if ( SHT_NULL == sec->get_type() ) { + section_generated[index] = true; + continue; + } + + Elf_Xword secAlign = 0; + // Fix up the alignment + if ( !section_generated[index] && + sec->is_address_initialized() && + SHT_NOBITS != sec->get_type() && + SHT_NULL != sec->get_type() && 0 != sec->get_size() ) { + // Align the sections based on the virtual addresses + // when possible (this is what matters for execution) + Elf64_Off req_offset = + sec->get_address() - seg->get_virtual_address(); + Elf64_Off cur_offset = current_file_pos - seg_start_pos; + if ( req_offset < cur_offset ) { + // something has gone awfully wrong, abort! 
+ // secAlign would turn out negative, seeking backwards and overwriting previous data + return false; + } + secAlign = req_offset - cur_offset; + } + else if ( !section_generated[index] && + !sec->is_address_initialized() ) { + // If no address has been specified then only the section + // alignment constraint has to be matched + Elf_Xword align = sec->get_addr_align(); + if ( align == 0 ) { + align = 1; + } + Elf64_Off error = current_file_pos % align; + secAlign = ( align - error ) % align; + } + else if ( section_generated[index] ) { + // Alignment for already generated sections + secAlign = + sec->get_offset() - seg_start_pos - segment_filesize; + } + + // Determine the segment file and memory sizes + // Special case .tbss section (NOBITS) in non TLS segment + if ( ( sec->get_flags() & SHF_ALLOC ) && + !( ( sec->get_flags() & SHF_TLS ) && + ( seg->get_type() != PT_TLS ) && + ( SHT_NOBITS == sec->get_type() ) ) ) + segment_memory += sec->get_size() + secAlign; + + if ( SHT_NOBITS != sec->get_type() ) + segment_filesize += sec->get_size() + secAlign; + + // Nothing to be done when generating nested segments + if ( section_generated[index] ) { + continue; + } + + current_file_pos += secAlign; + + // Set the section addresses when missing + if ( !sec->is_address_initialized() ) + sec->set_address( seg->get_virtual_address() + + current_file_pos - seg_start_pos ); + + if ( 0 != sec->get_index() ) + sec->set_offset( current_file_pos ); + + if ( SHT_NOBITS != sec->get_type() ) + current_file_pos += sec->get_size(); + + section_generated[index] = true; + } + + seg->set_file_size( segment_filesize ); + + // If we already have a memory size from loading an elf file (value > 0), + // it must not shrink! + // Memory size may be bigger than file size and it is the loader's job to do something + // with the surplus bytes in memory, like initializing them with a defined value. 
+ if ( seg->get_memory_size() < segment_memory ) { + seg->set_memory_size( segment_memory ); + } + + seg->set_offset( seg_start_pos ); + } + + return true; + } + + //------------------------------------------------------------------------------ + bool layout_section_table() + { + // Simply place the section table at the end for now + Elf64_Off alignmentError = current_file_pos % 4; + current_file_pos += ( 4 - alignmentError ) % 4; + header->set_sections_offset( current_file_pos ); + return true; + } + + //------------------------------------------------------------------------------ + public: + friend class Sections; + class Sections + { + public: + //------------------------------------------------------------------------------ + Sections( elfio* parent_ ) : parent( parent_ ) {} + + //------------------------------------------------------------------------------ + Elf_Half size() const { return (Elf_Half)parent->sections_.size(); } + + //------------------------------------------------------------------------------ + section* operator[]( unsigned int index ) const + { + section* sec = 0; + + if ( index < parent->sections_.size() ) { + sec = parent->sections_[index]; + } + + return sec; + } + + //------------------------------------------------------------------------------ + section* operator[]( const std::string& name ) const + { + section* sec = 0; + + std::vector::const_iterator it; + for ( it = parent->sections_.begin(); it != parent->sections_.end(); + ++it ) { + if ( ( *it )->get_name() == name ) { + sec = *it; + break; + } + } + + return sec; + } + + //------------------------------------------------------------------------------ + section* add( const std::string& name ) + { + section* new_section = parent->create_section(); + new_section->set_name( name ); + + Elf_Half str_index = parent->get_section_name_str_index(); + section* string_table( parent->sections_[str_index] ); + string_section_accessor str_writer( string_table ); + Elf_Word pos = 
str_writer.add_string( name ); + new_section->set_name_string_offset( pos ); + + return new_section; + } + + //------------------------------------------------------------------------------ + std::vector::iterator begin() + { + return parent->sections_.begin(); + } + + //------------------------------------------------------------------------------ + std::vector::iterator end() + { + return parent->sections_.end(); + } + + //------------------------------------------------------------------------------ + std::vector::const_iterator begin() const + { + return parent->sections_.cbegin(); + } + + //------------------------------------------------------------------------------ + std::vector::const_iterator end() const + { + return parent->sections_.cend(); + } + + //------------------------------------------------------------------------------ + private: + elfio* parent; + } sections; + + //------------------------------------------------------------------------------ + public: + friend class Segments; + class Segments + { + public: + //------------------------------------------------------------------------------ + Segments( elfio* parent_ ) : parent( parent_ ) {} + + //------------------------------------------------------------------------------ + Elf_Half size() const { return (Elf_Half)parent->segments_.size(); } + + //------------------------------------------------------------------------------ + segment* operator[]( unsigned int index ) const + { + return parent->segments_[index]; + } + + //------------------------------------------------------------------------------ + segment* add() { return parent->create_segment(); } + + //------------------------------------------------------------------------------ + std::vector::iterator begin() + { + return parent->segments_.begin(); + } + + //------------------------------------------------------------------------------ + std::vector::iterator end() + { + return parent->segments_.end(); + } + + 
//------------------------------------------------------------------------------ + std::vector::const_iterator begin() const + { + return parent->segments_.cbegin(); + } + + //------------------------------------------------------------------------------ + std::vector::const_iterator end() const + { + return parent->segments_.cend(); + } + + //------------------------------------------------------------------------------ + private: + elfio* parent; + } segments; + + //------------------------------------------------------------------------------ + private: + elf_header* header; + std::vector sections_; + std::vector segments_; + endianess_convertor convertor; + + Elf_Xword current_file_pos; +}; + +} // namespace ELFIO + + +/*** Start of inlined file: elfio_symbols.hpp ***/ +#ifndef ELFIO_SYMBOLS_HPP +#define ELFIO_SYMBOLS_HPP + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class symbol_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + symbol_section_accessor_template( const elfio& elf_file_, + S* symbol_section_ ) + : elf_file( elf_file_ ), symbol_section( symbol_section_ ) + { + find_hash_section(); + } + + //------------------------------------------------------------------------------ + Elf_Xword get_symbols_num() const + { + Elf_Xword nRet = 0; + if ( 0 != symbol_section->get_entry_size() ) { + nRet = + symbol_section->get_size() / symbol_section->get_entry_size(); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + bool get_symbol( Elf_Xword index, + std::string& name, + Elf64_Addr& value, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + bool ret = false; + + if ( elf_file.get_class() == ELFCLASS32 ) { + ret = generic_get_symbol( index, name, value, size, bind, + type, section_index, 
other ); + } + else { + ret = generic_get_symbol( index, name, value, size, bind, + type, section_index, other ); + } + + return ret; + } + + //------------------------------------------------------------------------------ + bool get_symbol( const std::string& name, + Elf64_Addr& value, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + bool ret = false; + + if ( 0 != get_hash_table_index() ) { + Elf_Word nbucket = *(const Elf_Word*)hash_section->get_data(); + Elf_Word nchain = *(const Elf_Word*)( hash_section->get_data() + + sizeof( Elf_Word ) ); + Elf_Word val = elf_hash( (const unsigned char*)name.c_str() ); + Elf_Word y = *(const Elf_Word*)( hash_section->get_data() + + ( 2 + val % nbucket ) * + sizeof( Elf_Word ) ); + std::string str; + get_symbol( y, str, value, size, bind, type, section_index, other ); + while ( str != name && STN_UNDEF != y && y < nchain ) { + y = *(const Elf_Word*)( hash_section->get_data() + + ( 2 + nbucket + y ) * + sizeof( Elf_Word ) ); + get_symbol( y, str, value, size, bind, type, section_index, + other ); + } + if ( str == name ) { + ret = true; + } + } + else { + for ( Elf_Xword i = 0; i < get_symbols_num() && !ret; i++ ) { + std::string symbol_name; + if ( get_symbol( i, symbol_name, value, size, bind, type, + section_index, other ) ) { + if ( symbol_name == name ) { + ret = true; + } + } + } + } + + return ret; + } + + //------------------------------------------------------------------------------ + bool get_symbol( const Elf64_Addr& value, + std::string& name, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + + const endianess_convertor& convertor = elf_file.get_convertor(); + + Elf_Xword idx = 0; + bool match = false; + Elf64_Addr v = 0; + + if ( elf_file.get_class() == ELFCLASS32 ) { + match = generic_search_symbols( + [&]( const Elf32_Sym* sym ) { + return convertor( 
sym->st_value ) == value; + }, + idx ); + } + else { + match = generic_search_symbols( + [&]( const Elf64_Sym* sym ) { + return convertor( sym->st_value ) == value; + }, + idx ); + } + + if ( match ) { + return get_symbol( idx, name, v, size, bind, type, section_index, + other ); + } + + return false; + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( Elf_Word name, + Elf64_Addr value, + Elf_Xword size, + unsigned char info, + unsigned char other, + Elf_Half shndx ) + { + Elf_Word nRet; + + if ( symbol_section->get_size() == 0 ) { + if ( elf_file.get_class() == ELFCLASS32 ) { + nRet = generic_add_symbol( 0, 0, 0, 0, 0, 0 ); + } + else { + nRet = generic_add_symbol( 0, 0, 0, 0, 0, 0 ); + } + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + nRet = generic_add_symbol( name, value, size, info, + other, shndx ); + } + else { + nRet = generic_add_symbol( name, value, size, info, + other, shndx ); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( Elf_Word name, + Elf64_Addr value, + Elf_Xword size, + unsigned char bind, + unsigned char type, + unsigned char other, + Elf_Half shndx ) + { + return add_symbol( name, value, size, ELF_ST_INFO( bind, type ), other, + shndx ); + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( string_section_accessor& pStrWriter, + const char* str, + Elf64_Addr value, + Elf_Xword size, + unsigned char info, + unsigned char other, + Elf_Half shndx ) + { + Elf_Word index = pStrWriter.add_string( str ); + return add_symbol( index, value, size, info, other, shndx ); + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( string_section_accessor& pStrWriter, + const char* str, + Elf64_Addr value, + Elf_Xword size, + unsigned char bind, + unsigned char type, + unsigned char other, + Elf_Half 
shndx ) + { + return add_symbol( pStrWriter, str, value, size, + ELF_ST_INFO( bind, type ), other, shndx ); + } + + //------------------------------------------------------------------------------ + Elf_Xword arrange_local_symbols( + std::function func = + nullptr ) + { + int nRet = 0; + + if ( elf_file.get_class() == ELFCLASS32 ) { + nRet = generic_arrange_local_symbols( func ); + } + else { + nRet = generic_arrange_local_symbols( func ); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + void find_hash_section() + { + hash_section = 0; + hash_section_index = 0; + Elf_Half nSecNo = elf_file.sections.size(); + for ( Elf_Half i = 0; i < nSecNo && 0 == hash_section_index; ++i ) { + const section* sec = elf_file.sections[i]; + if ( sec->get_link() == symbol_section->get_index() ) { + hash_section = sec; + hash_section_index = i; + } + } + } + + //------------------------------------------------------------------------------ + Elf_Half get_string_table_index() const + { + return (Elf_Half)symbol_section->get_link(); + } + + //------------------------------------------------------------------------------ + Elf_Half get_hash_table_index() const { return hash_section_index; } + + //------------------------------------------------------------------------------ + template const T* generic_get_symbol_ptr( Elf_Xword index ) const + { + if ( 0 != symbol_section->get_data() && index < get_symbols_num() ) { + const T* pSym = reinterpret_cast( + symbol_section->get_data() + + index * symbol_section->get_entry_size() ); + + return pSym; + } + + return nullptr; + } + + //------------------------------------------------------------------------------ + template + bool generic_search_symbols( std::function match, + Elf_Xword& idx ) const + { + for ( Elf_Xword i = 0; i < get_symbols_num(); i++ ) { + const T* symPtr = 
generic_get_symbol_ptr( i ); + + if ( symPtr == nullptr ) + return false; + + if ( match( symPtr ) ) { + idx = i; + return true; + } + } + + return false; + } + + //------------------------------------------------------------------------------ + template + bool generic_get_symbol( Elf_Xword index, + std::string& name, + Elf64_Addr& value, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + bool ret = false; + + if ( 0 != symbol_section->get_data() && index < get_symbols_num() ) { + const T* pSym = reinterpret_cast( + symbol_section->get_data() + + index * symbol_section->get_entry_size() ); + + const endianess_convertor& convertor = elf_file.get_convertor(); + + section* string_section = + elf_file.sections[get_string_table_index()]; + string_section_accessor str_reader( string_section ); + const char* pStr = + str_reader.get_string( convertor( pSym->st_name ) ); + if ( 0 != pStr ) { + name = pStr; + } + value = convertor( pSym->st_value ); + size = convertor( pSym->st_size ); + bind = ELF_ST_BIND( pSym->st_info ); + type = ELF_ST_TYPE( pSym->st_info ); + section_index = convertor( pSym->st_shndx ); + other = pSym->st_other; + + ret = true; + } + + return ret; + } + + //------------------------------------------------------------------------------ + template + Elf_Word generic_add_symbol( Elf_Word name, + Elf64_Addr value, + Elf_Xword size, + unsigned char info, + unsigned char other, + Elf_Half shndx ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T entry; + entry.st_name = convertor( name ); + entry.st_value = value; + entry.st_value = convertor( entry.st_value ); + entry.st_size = size; + entry.st_size = convertor( entry.st_size ); + entry.st_info = convertor( info ); + entry.st_other = convertor( other ); + entry.st_shndx = convertor( shndx ); + + symbol_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + + Elf_Word nRet = 
symbol_section->get_size() / sizeof( entry ) - 1; + + return nRet; + } + + //------------------------------------------------------------------------------ + template + Elf_Xword generic_arrange_local_symbols( + std::function func ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + const Elf_Xword size = symbol_section->get_entry_size(); + + Elf_Xword first_not_local = + 1; // Skip the first entry. It is always NOTYPE + Elf_Xword current = 0; + Elf_Xword count = get_symbols_num(); + + while ( true ) { + T* p1 = nullptr; + T* p2 = nullptr; + + while ( first_not_local < count ) { + p1 = const_cast( + generic_get_symbol_ptr( first_not_local ) ); + if ( ELF_ST_BIND( convertor( p1->st_info ) ) != STB_LOCAL ) + break; + ++first_not_local; + } + + current = first_not_local + 1; + while ( current < count ) { + p2 = const_cast( generic_get_symbol_ptr( current ) ); + if ( ELF_ST_BIND( convertor( p2->st_info ) ) == STB_LOCAL ) + break; + ++current; + } + + if ( first_not_local < count && current < count ) { + if ( func ) + func( first_not_local, current ); + + // Swap the symbols + T tmp; + std::copy( p1, p1 + 1, &tmp ); + std::copy( p2, p2 + 1, p1 ); + std::copy( &tmp, &tmp + 1, p2 ); + } + else { + // Update 'info' field of the section + symbol_section->set_info( first_not_local ); + break; + } + } + + // Elf_Word nRet = symbol_section->get_size() / sizeof(entry) - 1; + + return first_not_local; + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* symbol_section; + Elf_Half hash_section_index; + const section* hash_section; +}; + +using symbol_section_accessor = symbol_section_accessor_template
; +using const_symbol_section_accessor = + symbol_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_SYMBOLS_HPP + +/*** End of inlined file: elfio_symbols.hpp ***/ + + +/*** Start of inlined file: elfio_note.hpp ***/ +#ifndef ELFIO_NOTE_HPP +#define ELFIO_NOTE_HPP + +namespace ELFIO { + +//------------------------------------------------------------------------------ +// There are discrepancies in documentations. SCO documentation +// (http://www.sco.com/developers/gabi/latest/ch5.pheader.html#note_section) +// requires 8 byte entries alignment for 64-bit ELF file, +// but Oracle's definition uses the same structure +// for 32-bit and 64-bit formats. +// (https://docs.oracle.com/cd/E23824_01/html/819-0690/chapter6-18048.html) +// +// It looks like EM_X86_64 Linux implementation is similar to Oracle's +// definition. Therefore, the same alignment works for both formats +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +template class note_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + note_section_accessor_template( const elfio& elf_file_, S* section_ ) + : elf_file( elf_file_ ), note_section( section_ ) + { + process_section(); + } + + //------------------------------------------------------------------------------ + Elf_Word get_notes_num() const + { + return (Elf_Word)note_start_positions.size(); + } + + //------------------------------------------------------------------------------ + bool get_note( Elf_Word index, + Elf_Word& type, + std::string& name, + void*& desc, + Elf_Word& descSize ) const + { + if ( index >= note_section->get_size() ) { + return false; + } + + const char* pData = + note_section->get_data() + note_start_positions[index]; + int align = sizeof( Elf_Word ); + + const endianess_convertor& convertor = elf_file.get_convertor(); + type 
= convertor( *(const Elf_Word*)( pData + 2 * align ) ); + Elf_Word namesz = convertor( *(const Elf_Word*)( pData ) ); + descSize = convertor( *(const Elf_Word*)( pData + sizeof( namesz ) ) ); + + Elf_Xword max_name_size = + note_section->get_size() - note_start_positions[index]; + if ( namesz < 1 || namesz > max_name_size || + (Elf_Xword)namesz + descSize > max_name_size ) { + return false; + } + name.assign( pData + 3 * align, namesz - 1 ); + if ( 0 == descSize ) { + desc = 0; + } + else { + desc = + const_cast( pData + 3 * align + + ( ( namesz + align - 1 ) / align ) * align ); + } + + return true; + } + + //------------------------------------------------------------------------------ + void add_note( Elf_Word type, + const std::string& name, + const void* desc, + Elf_Word descSize ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + int align = sizeof( Elf_Word ); + Elf_Word nameLen = (Elf_Word)name.size() + 1; + Elf_Word nameLenConv = convertor( nameLen ); + std::string buffer( reinterpret_cast( &nameLenConv ), align ); + Elf_Word descSizeConv = convertor( descSize ); + + buffer.append( reinterpret_cast( &descSizeConv ), align ); + type = convertor( type ); + buffer.append( reinterpret_cast( &type ), align ); + buffer.append( name ); + buffer.append( 1, '\x00' ); + const char pad[] = { '\0', '\0', '\0', '\0' }; + if ( nameLen % align != 0 ) { + buffer.append( pad, align - nameLen % align ); + } + if ( desc != 0 && descSize != 0 ) { + buffer.append( reinterpret_cast( desc ), descSize ); + if ( descSize % align != 0 ) { + buffer.append( pad, align - descSize % align ); + } + } + + note_start_positions.push_back( note_section->get_size() ); + note_section->append_data( buffer ); + } + + private: + //------------------------------------------------------------------------------ + void process_section() + { + const endianess_convertor& convertor = elf_file.get_convertor(); + const char* data = note_section->get_data(); + Elf_Xword size = 
note_section->get_size(); + Elf_Xword current = 0; + + note_start_positions.clear(); + + // Is it empty? + if ( 0 == data || 0 == size ) { + return; + } + + Elf_Word align = sizeof( Elf_Word ); + while ( current + (Elf_Xword)3 * align <= size ) { + note_start_positions.push_back( current ); + Elf_Word namesz = convertor( *(const Elf_Word*)( data + current ) ); + Elf_Word descsz = convertor( + *(const Elf_Word*)( data + current + sizeof( namesz ) ) ); + + current += (Elf_Xword)3 * sizeof( Elf_Word ) + + ( ( namesz + align - 1 ) / align ) * (Elf_Xword)align + + ( ( descsz + align - 1 ) / align ) * (Elf_Xword)align; + } + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* note_section; + std::vector note_start_positions; +}; + +using note_section_accessor = note_section_accessor_template
; +using const_note_section_accessor = + note_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_NOTE_HPP + +/*** End of inlined file: elfio_note.hpp ***/ + + +/*** Start of inlined file: elfio_relocation.hpp ***/ +#ifndef ELFIO_RELOCATION_HPP +#define ELFIO_RELOCATION_HPP + +namespace ELFIO { + +template struct get_sym_and_type; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) + { + return ELF32_R_SYM( (Elf_Word)info ); + } + static int get_r_type( Elf_Xword info ) + { + return ELF32_R_TYPE( (Elf_Word)info ); + } +}; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) + { + return ELF32_R_SYM( (Elf_Word)info ); + } + static int get_r_type( Elf_Xword info ) + { + return ELF32_R_TYPE( (Elf_Word)info ); + } +}; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) { return ELF64_R_SYM( info ); } + static int get_r_type( Elf_Xword info ) { return ELF64_R_TYPE( info ); } +}; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) { return ELF64_R_SYM( info ); } + static int get_r_type( Elf_Xword info ) { return ELF64_R_TYPE( info ); } +}; + +//------------------------------------------------------------------------------ +template class relocation_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + relocation_section_accessor_template( const elfio& elf_file_, S* section_ ) + : elf_file( elf_file_ ), relocation_section( section_ ) + { + } + + //------------------------------------------------------------------------------ + Elf_Xword get_entries_num() const + { + Elf_Xword nRet = 0; + + if ( 0 != relocation_section->get_entry_size() ) { + nRet = relocation_section->get_size() / + relocation_section->get_entry_size(); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + bool get_entry( Elf_Xword index, + 
Elf64_Addr& offset, + Elf_Word& symbol, + Elf_Word& type, + Elf_Sxword& addend ) const + { + if ( index >= get_entries_num() ) { // Is index valid + return false; + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + if ( SHT_REL == relocation_section->get_type() ) { + generic_get_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_get_entry_rela( index, offset, symbol, type, + addend ); + } + } + else { + if ( SHT_REL == relocation_section->get_type() ) { + generic_get_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_get_entry_rela( index, offset, symbol, type, + addend ); + } + } + + return true; + } + + //------------------------------------------------------------------------------ + bool get_entry( Elf_Xword index, + Elf64_Addr& offset, + Elf64_Addr& symbolValue, + std::string& symbolName, + Elf_Word& type, + Elf_Sxword& addend, + Elf_Sxword& calcValue ) const + { + // Do regular job + Elf_Word symbol; + bool ret = get_entry( index, offset, symbol, type, addend ); + + // Find the symbol + Elf_Xword size; + unsigned char bind; + unsigned char symbolType; + Elf_Half section; + unsigned char other; + + symbol_section_accessor symbols( + elf_file, elf_file.sections[get_symbol_table_index()] ); + ret = ret && symbols.get_symbol( symbol, symbolName, symbolValue, size, + bind, symbolType, section, other ); + + if ( ret ) { // Was it successful? 
+ switch ( type ) { + case R_386_NONE: // none + calcValue = 0; + break; + case R_386_32: // S + A + calcValue = symbolValue + addend; + break; + case R_386_PC32: // S + A - P + calcValue = symbolValue + addend - offset; + break; + case R_386_GOT32: // G + A - P + calcValue = 0; + break; + case R_386_PLT32: // L + A - P + calcValue = 0; + break; + case R_386_COPY: // none + calcValue = 0; + break; + case R_386_GLOB_DAT: // S + case R_386_JMP_SLOT: // S + calcValue = symbolValue; + break; + case R_386_RELATIVE: // B + A + calcValue = addend; + break; + case R_386_GOTOFF: // S + A - GOT + calcValue = 0; + break; + case R_386_GOTPC: // GOT + A - P + calcValue = 0; + break; + default: // Not recognized symbol! + calcValue = 0; + break; + } + } + + return ret; + } + + //------------------------------------------------------------------------------ + bool set_entry( Elf_Xword index, + Elf64_Addr offset, + Elf_Word symbol, + Elf_Word type, + Elf_Sxword addend ) + { + if ( index >= get_entries_num() ) { // Is index valid + return false; + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + if ( SHT_REL == relocation_section->get_type() ) { + generic_set_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_set_entry_rela( index, offset, symbol, type, + addend ); + } + } + else { + if ( SHT_REL == relocation_section->get_type() ) { + generic_set_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_set_entry_rela( index, offset, symbol, type, + addend ); + } + } + + return true; + } + + //------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, Elf_Xword info ) + { + if ( elf_file.get_class() == ELFCLASS32 ) { + generic_add_entry( offset, info ); + } + else { + generic_add_entry( offset, info ); + } + } + + 
//------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, Elf_Word symbol, unsigned char type ) + { + Elf_Xword info; + if ( elf_file.get_class() == ELFCLASS32 ) { + info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + + add_entry( offset, info ); + } + + //------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend ) + { + if ( elf_file.get_class() == ELFCLASS32 ) { + generic_add_entry( offset, info, addend ); + } + else { + generic_add_entry( offset, info, addend ); + } + } + + //------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, + Elf_Word symbol, + unsigned char type, + Elf_Sxword addend ) + { + Elf_Xword info; + if ( elf_file.get_class() == ELFCLASS32 ) { + info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + + add_entry( offset, info, addend ); + } + + //------------------------------------------------------------------------------ + void add_entry( string_section_accessor str_writer, + const char* str, + symbol_section_accessor sym_writer, + Elf64_Addr value, + Elf_Word size, + unsigned char sym_info, + unsigned char other, + Elf_Half shndx, + Elf64_Addr offset, + unsigned char type ) + { + Elf_Word str_index = str_writer.add_string( str ); + Elf_Word sym_index = sym_writer.add_symbol( str_index, value, size, + sym_info, other, shndx ); + add_entry( offset, sym_index, type ); + } + + //------------------------------------------------------------------------------ + void swap_symbols( Elf_Xword first, Elf_Xword second ) + { + Elf64_Addr offset; + Elf_Word symbol; + Elf_Word rtype; + Elf_Sxword addend; + for ( Elf_Word i = 0; i < get_entries_num(); i++ ) { + get_entry( i, offset, symbol, rtype, addend ); + if ( symbol == 
first ) { + set_entry( i, offset, (Elf_Word)second, rtype, addend ); + } + if ( symbol == second ) { + set_entry( i, offset, (Elf_Word)first, rtype, addend ); + } + } + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + Elf_Half get_symbol_table_index() const + { + return (Elf_Half)relocation_section->get_link(); + } + + //------------------------------------------------------------------------------ + template + void generic_get_entry_rel( Elf_Xword index, + Elf64_Addr& offset, + Elf_Word& symbol, + Elf_Word& type, + Elf_Sxword& addend ) const + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + const T* pEntry = reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ); + offset = convertor( pEntry->r_offset ); + Elf_Xword tmp = convertor( pEntry->r_info ); + symbol = get_sym_and_type::get_r_sym( tmp ); + type = get_sym_and_type::get_r_type( tmp ); + addend = 0; + } + + //------------------------------------------------------------------------------ + template + void generic_get_entry_rela( Elf_Xword index, + Elf64_Addr& offset, + Elf_Word& symbol, + Elf_Word& type, + Elf_Sxword& addend ) const + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + const T* pEntry = reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ); + offset = convertor( pEntry->r_offset ); + Elf_Xword tmp = convertor( pEntry->r_info ); + symbol = get_sym_and_type::get_r_sym( tmp ); + type = get_sym_and_type::get_r_type( tmp ); + addend = convertor( pEntry->r_addend ); + } + + //------------------------------------------------------------------------------ + template + void generic_set_entry_rel( Elf_Xword index, + Elf64_Addr offset, + Elf_Word symbol, + Elf_Word type, + Elf_Sxword ) + { + const endianess_convertor& convertor = 
elf_file.get_convertor(); + + T* pEntry = const_cast( reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ) ); + + if ( elf_file.get_class() == ELFCLASS32 ) { + pEntry->r_info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + pEntry->r_info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + pEntry->r_offset = offset; + pEntry->r_offset = convertor( pEntry->r_offset ); + pEntry->r_info = convertor( pEntry->r_info ); + } + + //------------------------------------------------------------------------------ + template + void generic_set_entry_rela( Elf_Xword index, + Elf64_Addr offset, + Elf_Word symbol, + Elf_Word type, + Elf_Sxword addend ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T* pEntry = const_cast( reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ) ); + + if ( elf_file.get_class() == ELFCLASS32 ) { + pEntry->r_info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + pEntry->r_info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + pEntry->r_offset = offset; + pEntry->r_addend = addend; + pEntry->r_offset = convertor( pEntry->r_offset ); + pEntry->r_info = convertor( pEntry->r_info ); + pEntry->r_addend = convertor( pEntry->r_addend ); + } + + //------------------------------------------------------------------------------ + template + void generic_add_entry( Elf64_Addr offset, Elf_Xword info ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T entry; + entry.r_offset = offset; + entry.r_info = info; + entry.r_offset = convertor( entry.r_offset ); + entry.r_info = convertor( entry.r_info ); + + relocation_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + } + + //------------------------------------------------------------------------------ + template + void + generic_add_entry( Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend ) + { + const endianess_convertor& convertor 
= elf_file.get_convertor(); + + T entry; + entry.r_offset = offset; + entry.r_info = info; + entry.r_addend = addend; + entry.r_offset = convertor( entry.r_offset ); + entry.r_info = convertor( entry.r_info ); + entry.r_addend = convertor( entry.r_addend ); + + relocation_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* relocation_section; +}; + +using relocation_section_accessor = + relocation_section_accessor_template
; +using const_relocation_section_accessor = + relocation_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_RELOCATION_HPP + +/*** End of inlined file: elfio_relocation.hpp ***/ + + +/*** Start of inlined file: elfio_dynamic.hpp ***/ +#ifndef ELFIO_DYNAMIC_HPP +#define ELFIO_DYNAMIC_HPP + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class dynamic_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + dynamic_section_accessor_template( const elfio& elf_file_, S* section_ ) + : elf_file( elf_file_ ), dynamic_section( section_ ) + { + } + + //------------------------------------------------------------------------------ + Elf_Xword get_entries_num() const + { + Elf_Xword nRet = 0; + + if ( 0 != dynamic_section->get_entry_size() ) { + nRet = + dynamic_section->get_size() / dynamic_section->get_entry_size(); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + bool get_entry( Elf_Xword index, + Elf_Xword& tag, + Elf_Xword& value, + std::string& str ) const + { + if ( index >= get_entries_num() ) { // Is index valid + return false; + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + generic_get_entry_dyn( index, tag, value ); + } + else { + generic_get_entry_dyn( index, tag, value ); + } + + // If the tag may have a string table reference, prepare the string + if ( tag == DT_NEEDED || tag == DT_SONAME || tag == DT_RPATH || + tag == DT_RUNPATH ) { + string_section_accessor strsec = + elf_file.sections[get_string_table_index()]; + const char* result = strsec.get_string( value ); + if ( 0 == result ) { + str.clear(); + return false; + } + str = result; + } + else { + str.clear(); + } + + return true; + } + + //------------------------------------------------------------------------------ + void add_entry( Elf_Xword tag, Elf_Xword value ) + { + if ( 
elf_file.get_class() == ELFCLASS32 ) { + generic_add_entry( tag, value ); + } + else { + generic_add_entry( tag, value ); + } + } + + //------------------------------------------------------------------------------ + void add_entry( Elf_Xword tag, const std::string& str ) + { + string_section_accessor strsec = + elf_file.sections[get_string_table_index()]; + Elf_Xword value = strsec.add_string( str ); + add_entry( tag, value ); + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + Elf_Half get_string_table_index() const + { + return (Elf_Half)dynamic_section->get_link(); + } + + //------------------------------------------------------------------------------ + template + void generic_get_entry_dyn( Elf_Xword index, + Elf_Xword& tag, + Elf_Xword& value ) const + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + // Check unusual case when dynamic section has no data + if ( dynamic_section->get_data() == 0 || + ( index + 1 ) * dynamic_section->get_entry_size() > + dynamic_section->get_size() ) { + tag = DT_NULL; + value = 0; + return; + } + + const T* pEntry = reinterpret_cast( + dynamic_section->get_data() + + index * dynamic_section->get_entry_size() ); + tag = convertor( pEntry->d_tag ); + switch ( tag ) { + case DT_NULL: + case DT_SYMBOLIC: + case DT_TEXTREL: + case DT_BIND_NOW: + value = 0; + break; + case DT_NEEDED: + case DT_PLTRELSZ: + case DT_RELASZ: + case DT_RELAENT: + case DT_STRSZ: + case DT_SYMENT: + case DT_SONAME: + case DT_RPATH: + case DT_RELSZ: + case DT_RELENT: + case DT_PLTREL: + case DT_INIT_ARRAYSZ: + case DT_FINI_ARRAYSZ: + case DT_RUNPATH: + case DT_FLAGS: + case DT_PREINIT_ARRAYSZ: + value = convertor( pEntry->d_un.d_val ); + break; + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case 
DT_JMPREL: + case DT_INIT_ARRAY: + case DT_FINI_ARRAY: + case DT_PREINIT_ARRAY: + default: + value = convertor( pEntry->d_un.d_ptr ); + break; + } + } + + //------------------------------------------------------------------------------ + template void generic_add_entry( Elf_Xword tag, Elf_Xword value ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T entry; + + switch ( tag ) { + case DT_NULL: + case DT_SYMBOLIC: + case DT_TEXTREL: + case DT_BIND_NOW: + value = 0; + case DT_NEEDED: + case DT_PLTRELSZ: + case DT_RELASZ: + case DT_RELAENT: + case DT_STRSZ: + case DT_SYMENT: + case DT_SONAME: + case DT_RPATH: + case DT_RELSZ: + case DT_RELENT: + case DT_PLTREL: + case DT_INIT_ARRAYSZ: + case DT_FINI_ARRAYSZ: + case DT_RUNPATH: + case DT_FLAGS: + case DT_PREINIT_ARRAYSZ: + entry.d_un.d_val = convertor( value ); + break; + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_INIT_ARRAY: + case DT_FINI_ARRAY: + case DT_PREINIT_ARRAY: + default: + entry.d_un.d_ptr = convertor( value ); + break; + } + + entry.d_tag = convertor( tag ); + + dynamic_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* dynamic_section; +}; + +using dynamic_section_accessor = dynamic_section_accessor_template
; +using const_dynamic_section_accessor = + dynamic_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_DYNAMIC_HPP + +/*** End of inlined file: elfio_dynamic.hpp ***/ + + +/*** Start of inlined file: elfio_modinfo.hpp ***/ +#ifndef ELFIO_MODINFO_HPP +#define ELFIO_MODINFO_HPP + +#include +#include + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class modinfo_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + modinfo_section_accessor_template( S* section_ ) + : modinfo_section( section_ ) + { + process_section(); + } + + //------------------------------------------------------------------------------ + Elf_Word get_attribute_num() const { return (Elf_Word)content.size(); } + + //------------------------------------------------------------------------------ + bool + get_attribute( Elf_Word no, std::string& field, std::string& value ) const + { + if ( no < content.size() ) { + field = content[no].first; + value = content[no].second; + return true; + } + + return false; + } + + //------------------------------------------------------------------------------ + bool get_attribute( std::string field_name, std::string& value ) const + { + for ( auto i = content.begin(); i != content.end(); i++ ) { + if ( field_name == i->first ) { + value = i->second; + return true; + } + } + + return false; + } + + //------------------------------------------------------------------------------ + Elf_Word add_attribute( std::string field, std::string value ) + { + Elf_Word current_position = 0; + + if ( modinfo_section ) { + // Strings are addeded to the end of the current section data + current_position = (Elf_Word)modinfo_section->get_size(); + + std::string attribute = field + "=" + value; + + modinfo_section->append_data( attribute + '\0' ); + content.push_back( + std::pair( field, value ) ); + } + + return current_position; + } 
+ + //------------------------------------------------------------------------------ + private: + void process_section() + { + const char* pdata = modinfo_section->get_data(); + if ( pdata ) { + ELFIO::Elf_Xword i = 0; + while ( i < modinfo_section->get_size() ) { + while ( i < modinfo_section->get_size() && !pdata[i] ) + i++; + if ( i < modinfo_section->get_size() ) { + std::string info = pdata + i; + size_t loc = info.find( '=' ); + std::pair attribute( + info.substr( 0, loc ), info.substr( loc + 1 ) ); + + content.push_back( attribute ); + + i += info.length(); + } + } + } + } + + //------------------------------------------------------------------------------ + private: + S* modinfo_section; + std::vector> content; +}; + +using modinfo_section_accessor = modinfo_section_accessor_template
; +using const_modinfo_section_accessor = + modinfo_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_MODINFO_HPP + +/*** End of inlined file: elfio_modinfo.hpp ***/ + +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + +#endif // ELFIO_HPP + +/*** End of inlined file: elfio.hpp ***/ + + +namespace ELFIO { + +static struct class_table_t +{ + const char key; + const char* str; +} class_table[] = { + { ELFCLASS32, "ELF32" }, + { ELFCLASS64, "ELF64" }, +}; + +static struct endian_table_t +{ + const char key; + const char* str; +} endian_table[] = { + { ELFDATANONE, "None" }, + { ELFDATA2LSB, "Little endian" }, + { ELFDATA2MSB, "Big endian" }, +}; + +static struct version_table_t +{ + const Elf64_Word key; + const char* str; +} version_table[] = { + { EV_NONE, "None" }, + { EV_CURRENT, "Current" }, +}; + +static struct type_table_t +{ + const Elf32_Half key; + const char* str; +} type_table[] = { + { ET_NONE, "No file type" }, { ET_REL, "Relocatable file" }, + { ET_EXEC, "Executable file" }, { ET_DYN, "Shared object file" }, + { ET_CORE, "Core file" }, +}; + +static struct machine_table_t +{ + const Elf64_Half key; + const char* str; +} machine_table[] = { + { EM_NONE, "No machine" }, + { EM_M32, "AT&T WE 32100" }, + { EM_SPARC, "SUN SPARC" }, + { EM_386, "Intel 80386" }, + { EM_68K, "Motorola m68k family" }, + { EM_88K, "Motorola m88k family" }, + { EM_486, "Intel 80486// Reserved for future use" }, + { EM_860, "Intel 80860" }, + { EM_MIPS, "MIPS R3000 (officially, big-endian only)" }, + { EM_S370, "IBM System/370" }, + { EM_MIPS_RS3_LE, + "MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated" }, + { EM_res011, "Reserved" }, + { EM_res012, "Reserved" }, + { EM_res013, "Reserved" }, + { EM_res014, "Reserved" }, + { EM_PARISC, "HPPA" }, + { EM_res016, "Reserved" }, + { EM_VPP550, "Fujitsu VPP500" }, + { EM_SPARC32PLUS, "Sun's v8plus" }, + { EM_960, "Intel 80960" }, + { EM_PPC, "PowerPC" }, + { EM_PPC64, "64-bit PowerPC" }, + { EM_S390, "IBM S/390" 
}, + { EM_SPU, "Sony/Toshiba/IBM SPU" }, + { EM_res024, "Reserved" }, + { EM_res025, "Reserved" }, + { EM_res026, "Reserved" }, + { EM_res027, "Reserved" }, + { EM_res028, "Reserved" }, + { EM_res029, "Reserved" }, + { EM_res030, "Reserved" }, + { EM_res031, "Reserved" }, + { EM_res032, "Reserved" }, + { EM_res033, "Reserved" }, + { EM_res034, "Reserved" }, + { EM_res035, "Reserved" }, + { EM_V800, "NEC V800 series" }, + { EM_FR20, "Fujitsu FR20" }, + { EM_RH32, "TRW RH32" }, + { EM_MCORE, "Motorola M*Core // May also be taken by Fujitsu MMA" }, + { EM_RCE, "Old name for MCore" }, + { EM_ARM, "ARM" }, + { EM_OLD_ALPHA, "Digital Alpha" }, + { EM_SH, "Renesas (formerly Hitachi) / SuperH SH" }, + { EM_SPARCV9, "SPARC v9 64-bit" }, + { EM_TRICORE, "Siemens Tricore embedded processor" }, + { EM_ARC, "ARC Cores" }, + { EM_H8_300, "Renesas (formerly Hitachi) H8/300" }, + { EM_H8_300H, "Renesas (formerly Hitachi) H8/300H" }, + { EM_H8S, "Renesas (formerly Hitachi) H8S" }, + { EM_H8_500, "Renesas (formerly Hitachi) H8/500" }, + { EM_IA_64, "Intel IA-64 Processor" }, + { EM_MIPS_X, "Stanford MIPS-X" }, + { EM_COLDFIRE, "Motorola Coldfire" }, + { EM_68HC12, "Motorola M68HC12" }, + { EM_MMA, "Fujitsu Multimedia Accelerator" }, + { EM_PCP, "Siemens PCP" }, + { EM_NCPU, "Sony nCPU embedded RISC processor" }, + { EM_NDR1, "Denso NDR1 microprocesspr" }, + { EM_STARCORE, "Motorola Star*Core processor" }, + { EM_ME16, "Toyota ME16 processor" }, + { EM_ST100, "STMicroelectronics ST100 processor" }, + { EM_TINYJ, "Advanced Logic Corp. TinyJ embedded processor" }, + { EM_X86_64, "Advanced Micro Devices X86-64 processor" }, + { EM_PDSP, "Sony DSP Processor" }, + { EM_PDP10, "Digital Equipment Corp. PDP-10" }, + { EM_PDP11, "Digital Equipment Corp. 
PDP-11" }, + { EM_FX66, "Siemens FX66 microcontroller" }, + { EM_ST9PLUS, "STMicroelectronics ST9+ 8/16 bit microcontroller" }, + { EM_ST7, "STMicroelectronics ST7 8-bit microcontroller" }, + { EM_68HC16, "Motorola MC68HC16 Microcontroller" }, + { EM_68HC11, "Motorola MC68HC11 Microcontroller" }, + { EM_68HC08, "Motorola MC68HC08 Microcontroller" }, + { EM_68HC05, "Motorola MC68HC05 Microcontroller" }, + { EM_SVX, "Silicon Graphics SVx" }, + { EM_ST19, "STMicroelectronics ST19 8-bit cpu" }, + { EM_VAX, "Digital VAX" }, + { EM_CRIS, "Axis Communications 32-bit embedded processor" }, + { EM_JAVELIN, "Infineon Technologies 32-bit embedded cpu" }, + { EM_FIREPATH, "Element 14 64-bit DSP processor" }, + { EM_ZSP, "LSI Logic's 16-bit DSP processor" }, + { EM_MMIX, "Donald Knuth's educational 64-bit processor" }, + { EM_HUANY, "Harvard's machine-independent format" }, + { EM_PRISM, "SiTera Prism" }, + { EM_AVR, "Atmel AVR 8-bit microcontroller" }, + { EM_FR30, "Fujitsu FR30" }, + { EM_D10V, "Mitsubishi D10V" }, + { EM_D30V, "Mitsubishi D30V" }, + { EM_V850, "NEC v850" }, + { EM_M32R, "Renesas M32R (formerly Mitsubishi M32R)" }, + { EM_MN10300, "Matsushita MN10300" }, + { EM_MN10200, "Matsushita MN10200" }, + { EM_PJ, "picoJava" }, + { EM_OPENRISC, "OpenRISC 32-bit embedded processor" }, + { EM_ARC_A5, "ARC Cores Tangent-A5" }, + { EM_XTENSA, "Tensilica Xtensa Architecture" }, + { EM_VIDEOCORE, "Alphamosaic VideoCore processor" }, + { EM_TMM_GPP, "Thompson Multimedia General Purpose Processor" }, + { EM_NS32K, "National Semiconductor 32000 series" }, + { EM_TPC, "Tenor Network TPC processor" }, + { EM_SNP1K, "Trebia SNP 1000 processor" }, + { EM_ST200, "STMicroelectronics ST200 microcontroller" }, + { EM_IP2K, "Ubicom IP2022 micro controller" }, + { EM_MAX, "MAX Processor" }, + { EM_CR, "National Semiconductor CompactRISC" }, + { EM_F2MC16, "Fujitsu F2MC16" }, + { EM_MSP430, "TI msp430 micro controller" }, + { EM_BLACKFIN, "ADI Blackfin" }, + { EM_SE_C33, "S1C33 Family of 
Seiko Epson processors" }, + { EM_SEP, "Sharp embedded microprocessor" }, + { EM_ARCA, "Arca RISC Microprocessor" }, + { EM_UNICORE, "Microprocessor series from PKU-Unity Ltd. and MPRC of " + "Peking University" }, + { EM_EXCESS, "eXcess: 16/32/64-bit configurable embedded CPU" }, + { EM_DXP, "Icera Semiconductor Inc. Deep Execution Processor" }, + { EM_ALTERA_NIOS2, "Altera Nios II soft-core processor" }, + { EM_CRX, "National Semiconductor CRX" }, + { EM_XGATE, "Motorola XGATE embedded processor" }, + { EM_C166, "Infineon C16x/XC16x processor" }, + { EM_M16C, "Renesas M16C series microprocessors" }, + { EM_DSPIC30F, "Microchip Technology dsPIC30F Digital Signal Controller" }, + { EM_CE, "Freescale Communication Engine RISC core" }, + { EM_M32C, "Renesas M32C series microprocessors" }, + { EM_res121, "Reserved" }, + { EM_res122, "Reserved" }, + { EM_res123, "Reserved" }, + { EM_res124, "Reserved" }, + { EM_res125, "Reserved" }, + { EM_res126, "Reserved" }, + { EM_res127, "Reserved" }, + { EM_res128, "Reserved" }, + { EM_res129, "Reserved" }, + { EM_res130, "Reserved" }, + { EM_TSK3000, "Altium TSK3000 core" }, + { EM_RS08, "Freescale RS08 embedded processor" }, + { EM_res133, "Reserved" }, + { EM_ECOG2, "Cyan Technology eCOG2 microprocessor" }, + { EM_SCORE, "Sunplus Score" }, + { EM_SCORE7, "Sunplus S+core7 RISC processor" }, + { EM_DSP24, "New Japan Radio (NJR) 24-bit DSP Processor" }, + { EM_VIDEOCORE3, "Broadcom VideoCore III processor" }, + { EM_LATTICEMICO32, "RISC processor for Lattice FPGA architecture" }, + { EM_SE_C17, "Seiko Epson C17 family" }, + { EM_TI_C6000, "Texas Instruments TMS320C6000 DSP family" }, + { EM_TI_C2000, "Texas Instruments TMS320C2000 DSP family" }, + { EM_TI_C5500, "Texas Instruments TMS320C55x DSP family" }, + { EM_res143, "Reserved" }, + { EM_res144, "Reserved" }, + { EM_res145, "Reserved" }, + { EM_res146, "Reserved" }, + { EM_res147, "Reserved" }, + { EM_res148, "Reserved" }, + { EM_res149, "Reserved" }, + { EM_res150, 
"Reserved" }, + { EM_res151, "Reserved" }, + { EM_res152, "Reserved" }, + { EM_res153, "Reserved" }, + { EM_res154, "Reserved" }, + { EM_res155, "Reserved" }, + { EM_res156, "Reserved" }, + { EM_res157, "Reserved" }, + { EM_res158, "Reserved" }, + { EM_res159, "Reserved" }, + { EM_MMDSP_PLUS, "STMicroelectronics 64bit VLIW Data Signal Processor" }, + { EM_CYPRESS_M8C, "Cypress M8C microprocessor" }, + { EM_R32C, "Renesas R32C series microprocessors" }, + { EM_TRIMEDIA, "NXP Semiconductors TriMedia architecture family" }, + { EM_QDSP6, "QUALCOMM DSP6 Processor" }, + { EM_8051, "Intel 8051 and variants" }, + { EM_STXP7X, "STMicroelectronics STxP7x family" }, + { EM_NDS32, + "Andes Technology compact code size embedded RISC processor family" }, + { EM_ECOG1, "Cyan Technology eCOG1X family" }, + { EM_ECOG1X, "Cyan Technology eCOG1X family" }, + { EM_MAXQ30, "Dallas Semiconductor MAXQ30 Core Micro-controllers" }, + { EM_XIMO16, "New Japan Radio (NJR) 16-bit DSP Processor" }, + { EM_MANIK, "M2000 Reconfigurable RISC Microprocessor" }, + { EM_CRAYNV2, "Cray Inc. 
NV2 vector architecture" }, + { EM_RX, "Renesas RX family" }, + { EM_METAG, "Imagination Technologies META processor architecture" }, + { EM_MCST_ELBRUS, "MCST Elbrus general purpose hardware architecture" }, + { EM_ECOG16, "Cyan Technology eCOG16 family" }, + { EM_CR16, "National Semiconductor CompactRISC 16-bit processor" }, + { EM_ETPU, "Freescale Extended Time Processing Unit" }, + { EM_SLE9X, "Infineon Technologies SLE9X core" }, + { EM_L1OM, "Intel L1OM" }, + { EM_INTEL181, "Reserved by Intel" }, + { EM_INTEL182, "Reserved by Intel" }, + { EM_res183, "Reserved by ARM" }, + { EM_res184, "Reserved by ARM" }, + { EM_AVR32, "Atmel Corporation 32-bit microprocessor family" }, + { EM_STM8, "STMicroeletronics STM8 8-bit microcontroller" }, + { EM_TILE64, "Tilera TILE64 multicore architecture family" }, + { EM_TILEPRO, "Tilera TILEPro multicore architecture family" }, + { EM_MICROBLAZE, "Xilinx MicroBlaze 32-bit RISC soft processor core" }, + { EM_CUDA, "NVIDIA CUDA architecture " }, +}; + +static struct section_type_table_t +{ + const Elf64_Half key; + const char* str; +} section_type_table[] = { + { SHT_NULL, "NULL" }, + { SHT_PROGBITS, "PROGBITS" }, + { SHT_SYMTAB, "SYMTAB" }, + { SHT_STRTAB, "STRTAB" }, + { SHT_RELA, "RELA" }, + { SHT_HASH, "HASH" }, + { SHT_DYNAMIC, "DYNAMIC" }, + { SHT_NOTE, "NOTE" }, + { SHT_NOBITS, "NOBITS" }, + { SHT_REL, "REL" }, + { SHT_SHLIB, "SHLIB" }, + { SHT_DYNSYM, "DYNSYM" }, + { SHT_INIT_ARRAY, "INIT_ARRAY" }, + { SHT_FINI_ARRAY, "FINI_ARRAY" }, + { SHT_PREINIT_ARRAY, "PREINIT_ARRAY" }, + { SHT_GROUP, "GROUP" }, + { SHT_SYMTAB_SHNDX, "SYMTAB_SHNDX " }, +}; + +static struct segment_type_table_t +{ + const Elf_Word key; + const char* str; +} segment_type_table[] = { + { PT_NULL, "NULL" }, { PT_LOAD, "LOAD" }, { PT_DYNAMIC, "DYNAMIC" }, + { PT_INTERP, "INTERP" }, { PT_NOTE, "NOTE" }, { PT_SHLIB, "SHLIB" }, + { PT_PHDR, "PHDR" }, { PT_TLS, "TLS" }, +}; + +static struct segment_flag_table_t +{ + const Elf_Word key; + const char* str; +} 
segment_flag_table[] = { + { 0, "" }, { 1, "X" }, { 2, "W" }, { 3, "WX" }, + { 4, "R" }, { 5, "RX" }, { 6, "RW" }, { 7, "RWX" }, +}; + +static struct symbol_bind_t +{ + const Elf_Word key; + const char* str; +} symbol_bind_table[] = { + { STB_LOCAL, "LOCAL" }, { STB_GLOBAL, "GLOBAL" }, + { STB_WEAK, "WEAK" }, { STB_LOOS, "LOOS" }, + { STB_HIOS, "HIOS" }, { STB_MULTIDEF, "MULTIDEF" }, + { STB_LOPROC, "LOPROC" }, { STB_HIPROC, "HIPROC" }, +}; + +static struct symbol_type_t +{ + const Elf_Word key; + const char* str; +} symbol_type_table[] = { + { STT_NOTYPE, "NOTYPE" }, { STT_OBJECT, "OBJECT" }, + { STT_FUNC, "FUNC" }, { STT_SECTION, "SECTION" }, + { STT_FILE, "FILE" }, { STT_COMMON, "COMMON" }, + { STT_TLS, "TLS" }, { STT_LOOS, "LOOS" }, + { STT_HIOS, "HIOS" }, { STT_LOPROC, "LOPROC" }, + { STT_HIPROC, "HIPROC" }, +}; + +static struct dynamic_tag_t +{ + const Elf_Word key; + const char* str; +} dynamic_tag_table[] = { + { DT_NULL, "NULL" }, + { DT_NEEDED, "NEEDED" }, + { DT_PLTRELSZ, "PLTRELSZ" }, + { DT_PLTGOT, "PLTGOT" }, + { DT_HASH, "HASH" }, + { DT_STRTAB, "STRTAB" }, + { DT_SYMTAB, "SYMTAB" }, + { DT_RELA, "RELA" }, + { DT_RELASZ, "RELASZ" }, + { DT_RELAENT, "RELAENT" }, + { DT_STRSZ, "STRSZ" }, + { DT_SYMENT, "SYMENT" }, + { DT_INIT, "INIT" }, + { DT_FINI, "FINI" }, + { DT_SONAME, "SONAME" }, + { DT_RPATH, "RPATH" }, + { DT_SYMBOLIC, "SYMBOLIC" }, + { DT_REL, "REL" }, + { DT_RELSZ, "RELSZ" }, + { DT_RELENT, "RELENT" }, + { DT_PLTREL, "PLTREL" }, + { DT_DEBUG, "DEBUG" }, + { DT_TEXTREL, "TEXTREL" }, + { DT_JMPREL, "JMPREL" }, + { DT_BIND_NOW, "BIND_NOW" }, + { DT_INIT_ARRAY, "INIT_ARRAY" }, + { DT_FINI_ARRAY, "FINI_ARRAY" }, + { DT_INIT_ARRAYSZ, "INIT_ARRAYSZ" }, + { DT_FINI_ARRAYSZ, "FINI_ARRAYSZ" }, + { DT_RUNPATH, "RUNPATH" }, + { DT_FLAGS, "FLAGS" }, + { DT_ENCODING, "ENCODING" }, + { DT_PREINIT_ARRAY, "PREINIT_ARRAY" }, + { DT_PREINIT_ARRAYSZ, "PREINIT_ARRAYSZ" }, + { DT_MAXPOSTAGS, "MAXPOSTAGS" }, +}; + +static const ELFIO::Elf_Xword MAX_DATA_ENTRIES = 
64; + +//------------------------------------------------------------------------------ +class dump +{ +#define DUMP_DEC_FORMAT( width ) \ + std::setw( width ) << std::setfill( ' ' ) << std::dec << std::right +#define DUMP_HEX_FORMAT( width ) \ + std::setw( width ) << std::setfill( '0' ) << std::hex << std::right +#define DUMP_STR_FORMAT( width ) \ + std::setw( width ) << std::setfill( ' ' ) << std::hex << std::left + + public: + //------------------------------------------------------------------------------ + static void header( std::ostream& out, const elfio& reader ) + { + if ( !reader.get_header_size() ) { + return; + } + out << "ELF Header" << std::endl + << std::endl + << " Class: " << str_class( reader.get_class() ) << std::endl + << " Encoding: " << str_endian( reader.get_encoding() ) + << std::endl + << " ELFVersion: " << str_version( reader.get_elf_version() ) + << std::endl + << " Type: " << str_type( reader.get_type() ) << std::endl + << " Machine: " << str_machine( reader.get_machine() ) + << std::endl + << " Version: " << str_version( reader.get_version() ) + << std::endl + << " Entry: " + << "0x" << std::hex << reader.get_entry() << std::endl + << " Flags: " + << "0x" << std::hex << reader.get_flags() << std::endl + << std::endl; + } + + //------------------------------------------------------------------------------ + static void section_headers( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.sections.size(); + + if ( n == 0 ) { + return; + } + + out << "Section Headers:" << std::endl; + if ( reader.get_class() == ELFCLASS32 ) { // Output for 32-bit + out << "[ Nr ] Type Addr Size ES Flg Lk Inf " + "Al Name" + << std::endl; + } + else { // Output for 64-bit + out << "[ Nr ] Type Addr Size " + " ES Flg" + << std::endl + << " Lk Inf Al Name" << std::endl; + } + + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + section_header( out, i, sec, reader.get_class() ); + } + + out << "Key to 
Flags: W (write), A (alloc), X (execute)\n\n" + << std::endl; + } + + //------------------------------------------------------------------------------ + static void section_header( std::ostream& out, + Elf_Half no, + const section* sec, + unsigned char elf_class ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + if ( elf_class == ELFCLASS32 ) { // Output for 32-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 17 ) << str_section_type( sec->get_type() ) + << " " << DUMP_HEX_FORMAT( 8 ) << sec->get_address() << " " + << DUMP_HEX_FORMAT( 8 ) << sec->get_size() << " " + << DUMP_HEX_FORMAT( 2 ) << sec->get_entry_size() << " " + << DUMP_STR_FORMAT( 3 ) << section_flags( sec->get_flags() ) + << " " << DUMP_HEX_FORMAT( 2 ) << sec->get_link() << " " + << DUMP_HEX_FORMAT( 3 ) << sec->get_info() << " " + << DUMP_HEX_FORMAT( 2 ) << sec->get_addr_align() << " " + << DUMP_STR_FORMAT( 17 ) << sec->get_name() << " " << std::endl; + } + else { // Output for 64-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 17 ) << str_section_type( sec->get_type() ) + << " " << DUMP_HEX_FORMAT( 16 ) << sec->get_address() << " " + << DUMP_HEX_FORMAT( 16 ) << sec->get_size() << " " + << DUMP_HEX_FORMAT( 4 ) << sec->get_entry_size() << " " + << DUMP_STR_FORMAT( 3 ) << section_flags( sec->get_flags() ) + << " " << std::endl + << " " << DUMP_HEX_FORMAT( 4 ) << sec->get_link() << " " + << DUMP_HEX_FORMAT( 4 ) << sec->get_info() << " " + << DUMP_HEX_FORMAT( 4 ) << sec->get_addr_align() << " " + << DUMP_STR_FORMAT( 17 ) << sec->get_name() << " " << std::endl; + } + + out.flags( original_flags ); + + return; + } + + //------------------------------------------------------------------------------ + static void segment_headers( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.segments.size(); + if ( n == 0 ) { + return; + } + + out << "Segment headers:" << std::endl; + if ( reader.get_class() == ELFCLASS32 ) { // Output 
for 32-bit + out << "[ Nr ] Type VirtAddr PhysAddr FileSize Mem.Size " + "Flags Align" + << std::endl; + } + else { // Output for 64-bit + out << "[ Nr ] Type VirtAddr PhysAddr " + "Flags" + << std::endl + << " FileSize Mem.Size " + "Align" + << std::endl; + } + + for ( Elf_Half i = 0; i < n; ++i ) { + segment* seg = reader.segments[i]; + segment_header( out, i, seg, reader.get_class() ); + } + + out << std::endl; + } + + //------------------------------------------------------------------------------ + static void segment_header( std::ostream& out, + Elf_Half no, + const segment* seg, + unsigned int elf_class ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + if ( elf_class == ELFCLASS32 ) { // Output for 32-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 14 ) << str_segment_type( seg->get_type() ) + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_virtual_address() + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_physical_address() + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_file_size() << " " + << DUMP_HEX_FORMAT( 8 ) << seg->get_memory_size() << " " + << DUMP_STR_FORMAT( 8 ) << str_segment_flag( seg->get_flags() ) + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_align() << " " + << std::endl; + } + else { // Output for 64-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 14 ) << str_segment_type( seg->get_type() ) + << " " << DUMP_HEX_FORMAT( 16 ) << seg->get_virtual_address() + << " " << DUMP_HEX_FORMAT( 16 ) << seg->get_physical_address() + << " " << DUMP_STR_FORMAT( 16 ) + << str_segment_flag( seg->get_flags() ) << " " << std::endl + << " " << DUMP_HEX_FORMAT( 16 ) + << seg->get_file_size() << " " << DUMP_HEX_FORMAT( 16 ) + << seg->get_memory_size() << " " << DUMP_HEX_FORMAT( 16 ) + << seg->get_align() << " " << std::endl; + } + + out.flags( original_flags ); + } + + //------------------------------------------------------------------------------ + static void symbol_tables( std::ostream& out, const 
elfio& reader ) + { + Elf_Half n = reader.sections.size(); + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( SHT_SYMTAB == sec->get_type() || + SHT_DYNSYM == sec->get_type() ) { + symbol_section_accessor symbols( reader, sec ); + + Elf_Xword sym_no = symbols.get_symbols_num(); + if ( sym_no > 0 ) { + out << "Symbol table (" << sec->get_name() << ")" + << std::endl; + if ( reader.get_class() == + ELFCLASS32 ) { // Output for 32-bit + out << "[ Nr ] Value Size Type Bind " + "Sect Name" + << std::endl; + } + else { // Output for 64-bit + out << "[ Nr ] Value Size Type " + " Bind Sect" + << std::endl + << " Name" << std::endl; + } + for ( Elf_Xword i = 0; i < sym_no; ++i ) { + std::string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + unsigned char bind = 0; + unsigned char type = 0; + Elf_Half section = 0; + unsigned char other = 0; + symbols.get_symbol( i, name, value, size, bind, type, + section, other ); + symbol_table( out, i, name, value, size, bind, type, + section, reader.get_class() ); + } + + out << std::endl; + } + } + } + } + + //------------------------------------------------------------------------------ + static void symbol_table( std::ostream& out, + Elf_Xword no, + std::string& name, + Elf64_Addr value, + Elf_Xword size, + unsigned char bind, + unsigned char type, + Elf_Half section, + unsigned int elf_class ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + if ( elf_class == ELFCLASS32 ) { // Output for 32-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_HEX_FORMAT( 8 ) << value << " " << DUMP_HEX_FORMAT( 8 ) + << size << " " << DUMP_STR_FORMAT( 7 ) + << str_symbol_type( type ) << " " << DUMP_STR_FORMAT( 8 ) + << str_symbol_bind( bind ) << " " << DUMP_DEC_FORMAT( 5 ) + << section << " " << DUMP_STR_FORMAT( 1 ) << name << " " + << std::endl; + } + else { // Output for 64-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_HEX_FORMAT( 16 ) << value 
<< " " + << DUMP_HEX_FORMAT( 16 ) << size << " " << DUMP_STR_FORMAT( 7 ) + << str_symbol_type( type ) << " " << DUMP_STR_FORMAT( 8 ) + << str_symbol_bind( bind ) << " " << DUMP_DEC_FORMAT( 5 ) + << section << " " << std::endl + << " " << DUMP_STR_FORMAT( 1 ) << name << " " + << std::endl; + } + + out.flags( original_flags ); + } + + //------------------------------------------------------------------------------ + static void notes( std::ostream& out, const elfio& reader ) + { + Elf_Half no = reader.sections.size(); + for ( Elf_Half i = 0; i < no; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( SHT_NOTE == sec->get_type() ) { // Look at notes + note_section_accessor notes( reader, sec ); + Elf_Word no_notes = notes.get_notes_num(); + if ( no > 0 ) { + out << "Note section (" << sec->get_name() << ")" + << std::endl + << " No Type Name" << std::endl; + for ( Elf_Word j = 0; j < no_notes; ++j ) { // For all notes + Elf_Word type; + std::string name; + void* desc; + Elf_Word descsz; + + if ( notes.get_note( j, type, name, desc, descsz ) ) { + // 'name' usually contains \0 at the end. 
Try to fix it + name = name.c_str(); + note( out, j, type, name ); + } + } + + out << std::endl; + } + } + } + } + + //------------------------------------------------------------------------------ + static void modinfo( std::ostream& out, const elfio& reader ) + { + Elf_Half no = reader.sections.size(); + for ( Elf_Half i = 0; i < no; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( ".modinfo" == sec->get_name() ) { // Look for the section + out << "Section .modinfo" << std::endl; + + const_modinfo_section_accessor modinfo( sec ); + for ( Elf_Word i = 0; i < modinfo.get_attribute_num(); i++ ) { + std::string field; + std::string value; + if ( modinfo.get_attribute( i, field, value ) ) { + out << " " << std::setw( 20 ) << field + << std::setw( 0 ) << " = " << value << std::endl; + } + } + + out << std::endl; + break; + } + } + } + + //------------------------------------------------------------------------------ + static void + note( std::ostream& out, int no, Elf_Word type, const std::string& name ) + { + out << " [" << DUMP_DEC_FORMAT( 2 ) << no << "] " + << DUMP_HEX_FORMAT( 8 ) << type << " " << DUMP_STR_FORMAT( 1 ) + << name << std::endl; + } + + //------------------------------------------------------------------------------ + static void dynamic_tags( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.sections.size(); + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( SHT_DYNAMIC == sec->get_type() ) { + dynamic_section_accessor dynamic( reader, sec ); + + Elf_Xword dyn_no = dynamic.get_entries_num(); + if ( dyn_no > 0 ) { + out << "Dynamic section (" << sec->get_name() << ")" + << std::endl; + out << "[ Nr ] Tag Name/Value" << std::endl; + for ( Elf_Xword i = 0; i < dyn_no; ++i ) { + Elf_Xword tag = 0; + Elf_Xword value = 0; + std::string str; + dynamic.get_entry( i, tag, value, str ); + dynamic_tag( out, i, tag, value, str, + reader.get_class() ); + if ( DT_NULL == 
tag ) { + break; + } + } + + out << std::endl; + } + } + } + } + + //------------------------------------------------------------------------------ + static void dynamic_tag( std::ostream& out, + Elf_Xword no, + Elf_Xword tag, + Elf_Xword value, + std::string str, + unsigned int /*elf_class*/ ) + { + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 16 ) << str_dynamic_tag( tag ) << " "; + if ( str.empty() ) { + out << DUMP_HEX_FORMAT( 16 ) << value << " "; + } + else { + out << DUMP_STR_FORMAT( 32 ) << str << " "; + } + out << std::endl; + } + + //------------------------------------------------------------------------------ + static void section_data( std::ostream& out, const section* sec ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + out << sec->get_name() << std::endl; + const char* pdata = sec->get_data(); + if ( pdata ) { + ELFIO::Elf_Xword i; + for ( i = 0; i < std::min( sec->get_size(), MAX_DATA_ENTRIES ); + ++i ) { + if ( i % 16 == 0 ) { + out << "[" << DUMP_HEX_FORMAT( 8 ) << i << "]"; + } + + out << " " << DUMP_HEX_FORMAT( 2 ) << ( pdata[i] & 0x000000FF ); + + if ( i % 16 == 15 ) { + out << std::endl; + } + } + if ( i % 16 != 0 ) { + out << std::endl; + } + + out.flags( original_flags ); + } + + return; + } + + //------------------------------------------------------------------------------ + static void section_datas( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.sections.size(); + + if ( n == 0 ) { + return; + } + + out << "Section Data:" << std::endl; + + for ( Elf_Half i = 1; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( sec->get_type() == SHT_NOBITS ) { + continue; + } + section_data( out, sec ); + } + + out << std::endl; + } + + //------------------------------------------------------------------------------ + static void + segment_data( std::ostream& out, Elf_Half no, const segment* seg ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + out 
<< "Segment # " << no << std::endl; + const char* pdata = seg->get_data(); + if ( pdata ) { + ELFIO::Elf_Xword i; + for ( i = 0; i < std::min( seg->get_file_size(), MAX_DATA_ENTRIES ); + ++i ) { + if ( i % 16 == 0 ) { + out << "[" << DUMP_HEX_FORMAT( 8 ) << i << "]"; + } + + out << " " << DUMP_HEX_FORMAT( 2 ) << ( pdata[i] & 0x000000FF ); + + if ( i % 16 == 15 ) { + out << std::endl; + } + } + if ( i % 16 != 0 ) { + out << std::endl; + } + + out.flags( original_flags ); + } + + return; + } + + //------------------------------------------------------------------------------ + static void segment_datas( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.segments.size(); + + if ( n == 0 ) { + return; + } + + out << "Segment Data:" << std::endl; + + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + segment* seg = reader.segments[i]; + segment_data( out, i, seg ); + } + + out << std::endl; + } + + private: + //------------------------------------------------------------------------------ + template + std::string static find_value_in_table( const T& table, const K& key ) + { + std::string res = "?"; + for ( unsigned int i = 0; i < sizeof( table ) / sizeof( table[0] ); + ++i ) { + if ( table[i].key == key ) { + res = table[i].str; + break; + } + } + + return res; + } + + //------------------------------------------------------------------------------ + template + static std::string format_assoc( const T& table, const K& key ) + { + std::string str = find_value_in_table( table, key ); + if ( str == "?" 
) { + std::ostringstream oss; + oss << str << " (0x" << std::hex << key << ")"; + str = oss.str(); + } + + return str; + } + + //------------------------------------------------------------------------------ + template + static std::string format_assoc( const T& table, const char key ) + { + return format_assoc( table, (const int)key ); + } + + //------------------------------------------------------------------------------ + static std::string section_flags( Elf_Xword flags ) + { + std::string ret = ""; + if ( flags & SHF_WRITE ) { + ret += "W"; + } + if ( flags & SHF_ALLOC ) { + ret += "A"; + } + if ( flags & SHF_EXECINSTR ) { + ret += "X"; + } + + return ret; + } + +//------------------------------------------------------------------------------ +#define STR_FUNC_TABLE( name ) \ + template static std::string str_##name( const T key ) \ + { \ + return format_assoc( name##_table, key ); \ + } + + STR_FUNC_TABLE( class ) + STR_FUNC_TABLE( endian ) + STR_FUNC_TABLE( version ) + STR_FUNC_TABLE( type ) + STR_FUNC_TABLE( machine ) + STR_FUNC_TABLE( section_type ) + STR_FUNC_TABLE( segment_type ) + STR_FUNC_TABLE( segment_flag ) + STR_FUNC_TABLE( symbol_bind ) + STR_FUNC_TABLE( symbol_type ) + STR_FUNC_TABLE( dynamic_tag ) + +#undef STR_FUNC_TABLE +#undef DUMP_DEC_FORMAT +#undef DUMP_HEX_FORMAT +#undef DUMP_STR_FORMAT +}; // class dump + +}; // namespace ELFIO + +#endif // ELFIO_DUMP_HPP + +/*** End of inlined file: elfio_dump.hpp ***/ + diff --git a/3rdparty/testutils/cpp-stub/stub.h b/3rdparty/testutils/cpp-stub/stub.h new file mode 100644 index 00000000..c5f2f53f --- /dev/null +++ b/3rdparty/testutils/cpp-stub/stub.h @@ -0,0 +1,360 @@ +#ifndef __STUB_H__ +#define __STUB_H__ + +#ifdef _WIN32 +//windows +#include +#include +#else +//linux +#include +#include +#include +#endif +//c +#include +#include +//c++ +#include + + +#define ADDR(CLASS_NAME,MEMBER_NAME) (&CLASS_NAME::MEMBER_NAME) + +/********************************************************** + replace function 
+**********************************************************/ +#ifdef _WIN32 +#define CACHEFLUSH(addr, size) FlushInstructionCache(GetCurrentProcess(), addr, size) +#else +#define CACHEFLUSH(addr, size) __builtin___clear_cache(addr, addr + size) +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) + #define CODESIZE 16U + #define CODESIZE_MIN 16U + #define CODESIZE_MAX CODESIZE + // ldr x9, +8 + // br x9 + // addr + #define REPLACE_FAR(t, fn, fn_stub)\ + ((uint32_t*)fn)[0] = 0x58000040 | 9;\ + ((uint32_t*)fn)[1] = 0xd61f0120 | (9 << 5);\ + *(long long *)(fn + 8) = (long long )fn_stub;\ + CACHEFLUSH((char *)fn, CODESIZE); + #define REPLACE_NEAR(t, fn, fn_stub) REPLACE_FAR(t, fn, fn_stub) +#elif defined(__arm__) || defined(_M_ARM) + #define CODESIZE 8U + #define CODESIZE_MIN 8U + #define CODESIZE_MAX CODESIZE + // ldr pc, [pc, #-4] + #define REPLACE_FAR(t, fn, fn_stub)\ + ((uint32_t*)fn)[0] = 0xe51ff004;\ + ((uint32_t*)fn)[1] = (uint32_t)fn_stub;\ + CACHEFLUSH((char *)fn, CODESIZE); + #define REPLACE_NEAR(t, fn, fn_stub) REPLACE_FAR(t, fn, fn_stub) +#elif defined(__mips64) + #define CACHEFLUSH(addr, size) __builtin___clear_cache(addr, addr + size) + #define CODESIZE 80U + #define CODESIZE_MIN 80U + #define CODESIZE_MAX CODESIZE + //mips没有PC指针,所以需要手动入栈出栈 + //120000ce0: 67bdffe0 daddiu sp, sp, -32 //入栈 + //120000ce4: ffbf0018 sd ra, 24(sp) + //120000ce8: ffbe0010 sd s8, 16(sp) + //120000cec: ffbc0008 sd gp, 8(sp) + //120000cf0: 03a0f025 move s8, sp + + //120000d2c: 03c0e825 move sp, s8 //出栈 + //120000d30: dfbf0018 ld ra, 24(sp) + //120000d34: dfbe0010 ld s8, 16(sp) + //120000d38: dfbc0008 ld gp, 8(sp) + //120000d3c: 67bd0020 daddiu sp, sp, 32 + //120000d40: 03e00008 jr ra + + #define REPLACE_FAR(t, fn, fn_stub)\ + ((uint32_t *)fn)[0] = 0x67bdffe0;\ + ((uint32_t *)fn)[1] = 0xffbf0018;\ + ((uint32_t *)fn)[2] = 0xffbe0010;\ + ((uint32_t *)fn)[3] = 0xffbc0008;\ + ((uint32_t *)fn)[4] = 0x03a0f025;\ + *(uint16_t *)(fn + 20) = (long long)fn_stub >> 32;\ + *(fn + 22) = 
0x19;\ + *(fn + 23) = 0x24;\ + ((uint32_t *)fn)[6] = 0x0019cc38;\ + *(uint16_t *)(fn + 28) = (long long)fn_stub >> 16;\ + *(fn + 30) = 0x39;\ + *(fn + 31) = 0x37;\ + ((uint32_t *)fn)[8] = 0x0019cc38;\ + *(uint16_t *)(fn + 36) = (long long)fn_stub;\ + *(fn + 38) = 0x39;\ + *(fn + 39) = 0x37;\ + ((uint32_t *)fn)[10] = 0x0320f809;\ + ((uint32_t *)fn)[11] = 0x00000000;\ + ((uint32_t *)fn)[12] = 0x00000000;\ + ((uint32_t *)fn)[13] = 0x03c0e825;\ + ((uint32_t *)fn)[14] = 0xdfbf0018;\ + ((uint32_t *)fn)[15] = 0xdfbe0010;\ + ((uint32_t *)fn)[16] = 0xdfbc0008;\ + ((uint32_t *)fn)[17] = 0x67bd0020;\ + ((uint32_t *)fn)[18] = 0x03e00008;\ + ((uint32_t *)fn)[19] = 0x00000000;\ + CACHEFLUSH((char *)fn, CODESIZE); + #define REPLACE_NEAR(t, fn, fn_stub) REPLACE_FAR(t, fn, fn_stub) +#elif defined(__thumb__) || defined(_M_THUMB) + #error "Thumb is not supported" +#else //__i386__ _x86_64__ + #define CODESIZE 13U + #define CODESIZE_MIN 5U + #define CODESIZE_MAX CODESIZE + //13 byte(jmp m16:64) + //movabs $0x102030405060708,%r11 + //jmpq *%r11 + static void REPLACE_FAR(void *t, char *fn, char *fn_stub) + { + *fn = 0x49; + *(fn + 1) = 0xbb; + *(long long *)(fn + 2) = (long long)fn_stub; + *(fn + 10) = 0x41; + *(fn + 11) = 0xff; + *(fn + 12) = 0xe3; + CACHEFLUSH((char *)fn, CODESIZE); + } + //5 byte(jmp rel32) + #define REPLACE_NEAR(t, fn, fn_stub)\ + *fn = 0xE9;\ + *(int *)(fn + 1) = (int)(fn_stub - fn - CODESIZE_MIN);\ + CACHEFLUSH((char *)fn, CODESIZE); +#endif + +struct func_stub +{ + char *fn; + unsigned char code_buf[CODESIZE]; + bool far_jmp; +}; + +class Stub +{ +public: + Stub() + { +#ifdef _WIN32 + SYSTEM_INFO sys_info; + GetSystemInfo(&sys_info); + m_pagesize = sys_info.dwPageSize; +#else + m_pagesize = sysconf(_SC_PAGE_SIZE); +#endif + + if (m_pagesize < 0) + { + m_pagesize = 4096; + } + } + ~Stub() + { + clear(); + } + + virtual void clear() + { + std::map::iterator iter; + struct func_stub *pstub; + for(iter=m_result.begin(); iter != m_result.end(); iter++) + { + pstub = 
iter->second; +#ifdef _WIN32 + DWORD lpflOldProtect; + if(0 != VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READWRITE, &lpflOldProtect)) +#else + if (0 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_WRITE | PROT_EXEC)) +#endif + { + + if(pstub->far_jmp) + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MAX); + } + else + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MIN); + } + +#ifdef _WIN32 + VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READ, &lpflOldProtect); +#else + CACHEFLUSH(pstub->fn,CODESIZE); + mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_EXEC); +#endif + } + + iter->second = NULL; + delete pstub; + } + + m_result.clear(); + return; + } + template + bool set(T addr, S addr_stub) + { + char * fn; + char * fn_stub; + fn = addrof(addr); + fn_stub = addrof(addr_stub); + struct func_stub *pstub; + std::map::iterator iter = m_result.find(fn); + + if (iter == m_result.end()) + { + pstub = new func_stub; + //start + pstub->fn = fn; + + if(distanceof(fn, fn_stub)) + { + pstub->far_jmp = true; + std::memcpy(pstub->code_buf, fn, CODESIZE_MAX); + } + else + { + pstub->far_jmp = false; + std::memcpy(pstub->code_buf, fn, CODESIZE_MIN); + } + } + else { + pstub = iter->second; + pstub->far_jmp = distanceof(fn, fn_stub); + } + + + +#ifdef _WIN32 + DWORD lpflOldProtect; + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READWRITE, &lpflOldProtect)) +#else + if (-1 == mprotect(pageof(pstub->fn), static_cast(m_pagesize * 2), PROT_READ | PROT_WRITE | PROT_EXEC)) +#endif + { + throw("stub set memory protect to w+r+x faild"); + return false; + } + + if(pstub->far_jmp) + { + REPLACE_FAR(this, fn, fn_stub); + } + else + { + REPLACE_NEAR(this, fn, fn_stub); + } + + +#ifdef _WIN32 + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READ, &lpflOldProtect)) +#else + if (-1 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_EXEC)) +#endif + { + 
throw("stub set memory protect to r+x failed"); + return false; + } + m_result.insert(std::pair(fn,pstub)); + return true; + } + + template + bool reset(T addr) + { + char * fn; + fn = addrof(addr); + + std::map::iterator iter = m_result.find(fn); + + if (iter == m_result.end()) + { + return true; + } + struct func_stub *pstub; + pstub = iter->second; + +#ifdef _WIN32 + DWORD lpflOldProtect; + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READWRITE, &lpflOldProtect)) +#else + if (-1 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_WRITE | PROT_EXEC)) +#endif + { + throw("stub reset memory protect to w+r+x faild"); + return false; + } + + if(pstub->far_jmp) + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MAX); + } + else + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MIN); + } + +#ifdef _WIN32 + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READ, &lpflOldProtect)) +#else + CACHEFLUSH(pstub->fn,CODESIZE); + if (-1 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_EXEC)) +#endif + { + throw("stub reset memory protect to r+x failed"); + return false; + } + + m_result.erase(iter); + delete pstub; + + return true; + } +protected: + char *pageof(char* addr) + { +#ifdef _WIN32 + return (char *)((unsigned long long)addr & ~(m_pagesize - 1)); +#else + return (char *)((unsigned long)addr & ~(m_pagesize - 1)); +#endif + } + + template + char* addrof(T addr) + { + union + { + T _s; + char* _d; + }ut; + ut._s = addr; + return ut._d; + } + + bool distanceof(char* addr, char* addr_stub) + { + std::ptrdiff_t diff = addr_stub >= addr ? 
addr_stub - addr : addr - addr_stub; + if((sizeof(addr) > 4) && (((diff >> 31) - 1) > 0)) + { + return true; + } + return false; + } + +protected: +#ifdef _WIN32 + //LLP64 + long long m_pagesize; +#else + //LP64 + long m_pagesize; +#endif + std::map m_result; +}; + +#endif diff --git a/3rdparty/testutils/stub-ext/stub-shadow.cpp b/3rdparty/testutils/stub-ext/stub-shadow.cpp new file mode 100644 index 00000000..05d02092 --- /dev/null +++ b/3rdparty/testutils/stub-ext/stub-shadow.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: 2021 - 2023 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: MIT + +#include "stub-shadow.h" + +namespace stub_ext { + +WrapperMap stub_wrappers; + +Wrapper::Wrapper() +{ + +} + +Wrapper::~Wrapper() +{ + +} + +void freeWrapper(Wrapper *wrapper) +{ + if (!wrapper) + return; + + for (auto iter = stub_wrappers.begin(); iter != stub_wrappers.end();) { + if (iter->second == wrapper) + iter = stub_wrappers.erase(iter); + else + ++iter; + } + + delete wrapper; +} +} diff --git a/3rdparty/testutils/stub-ext/stub-shadow.h b/3rdparty/testutils/stub-ext/stub-shadow.h new file mode 100644 index 00000000..9199a12a --- /dev/null +++ b/3rdparty/testutils/stub-ext/stub-shadow.h @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: 2021 - 2023 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: MIT + +#ifndef STUBSHADOW_H +#define STUBSHADOW_H + +#include +#include + +namespace stub_ext { + +#define LAMDA_FUNCTION_TYPE decltype(&Lamda::operator()) + +class Wrapper +{ +public: + Wrapper(); + virtual ~Wrapper(); +}; + +typedef std::unordered_map WrapperMap; +extern WrapperMap stub_wrappers; + +template +class LamdaWrapper : public Wrapper +{ +public: + LamdaWrapper(Lamda func): Wrapper(),_func(func){} + ~LamdaWrapper(){} + Lamda _func; +}; + +template +struct VFLocator +{ + +}; + +template +struct VFLocator +{ + typedef Ret (*Func)(Obj*, Args...); +}; + +template +struct VFLocator +{ + typedef Ret (*Func)(Obj*, Args...); +}; + +template +struct LamdaCaller +{ + +}; + +template +struct LamdaCaller +{ + template + static Ret call(LamdaWrapper *wrapper, OrgArgs&&... args) + { + return wrapper->_func(std::forward(args)...); + } +}; + +template +struct LamdaCaller +{ + template + static Ret call(LamdaWrapper *wrapper, OrgArgs&&... args) + { + return wrapper->_func(); + } +}; + +template +struct FuncShadow +{ + +}; + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Args...); + typedef Ret RetType; + + static Ret call(Args ...args) + { + Shadow shadow = &call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, args...); + } +}; + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Args...); + typedef Ret RetType; + + static Ret call(Args ...args) + { + Shadow shadow = &call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, args...); + } +}; + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Obj *,Args...); + typedef Ret RetType; + static Ret call(Obj *obj, Args ...args) + { + Shadow shadow = 
&call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, obj, args...); + } +}; + + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Obj *,Args...); + typedef Ret RetType; + static Ret call(Obj *obj, Args ...args) + { + Shadow shadow = &call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, obj, args...); + } +}; + +template +typename FuncShadow::Shadow depictShadow(Wrapper **wrapper, Func func, Lamda lamda) +{ + *wrapper = new LamdaWrapper(lamda); + typename FuncShadow::Shadow shadow = &FuncShadow::call; + long id = (long)shadow; + assert(stub_wrappers.find(id) == stub_wrappers.end()); + stub_wrappers.insert(std::make_pair(id,*wrapper)); + return shadow; +} + +void freeWrapper(Wrapper *wrapper); + +} + +#endif // STUBSHADOW_H diff --git a/3rdparty/testutils/stub-ext/stubext.h b/3rdparty/testutils/stub-ext/stubext.h new file mode 100644 index 00000000..a31d815d --- /dev/null +++ b/3rdparty/testutils/stub-ext/stubext.h @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: 2021 - 2023 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: MIT + +#ifndef STUBEXT_H +#define STUBEXT_H + +//需修改Stub的私用成员函数和成员变量为保护类型 +#include "stub.h" + +#include "stub-shadow.h" + +#ifdef DEBUG_STUB_INVOKE +// use to make sure the stub function is invoked. 
+# define __DBG_STUB_INVOKE__ printf("stub at %s:%d is invoked.\n", __FILE__, __LINE__); +#else +# define __DBG_STUB_INVOKE__ +#endif + +#define VADDR(CLASS_NAME, MEMBER_NAME) (typename stub_ext::VFLocator::Func)(&CLASS_NAME::MEMBER_NAME) + +namespace stub_ext { + +class StubExt : public Stub +{ +public: + StubExt() + : Stub() { } + + template + bool set_lamda(T addr, Lamda lamda) + { + char *fn = addrof(addr); + if (m_result.find(fn) != m_result.end()) + reset(addr); + + Wrapper *wrapper = nullptr; + auto addr_stub = depictShadow(&wrapper, addr, lamda); + if (set(addr, addr_stub)) { + m_wrappers.insert(std::make_pair(fn, wrapper)); + return true; + } else { + freeWrapper(wrapper); + } + return false; + } + + template + void reset(T addr) + { + Stub::reset(addr); + char *fn = addrof(addr); + auto iter = m_wrappers.find(fn); + if (iter != m_wrappers.end()) { + freeWrapper(iter->second); + m_wrappers.erase(iter); + } + } + + ~StubExt() + { + clear(); + } + + void clear() override + { + Stub::clear(); + for (auto iter = m_wrappers.begin(); iter != m_wrappers.end(); ++iter) { + freeWrapper(iter->second); + } + m_wrappers.clear(); + } + + template + static void *get_ctor_addr(bool start = true) + { + // the start vairable must be true, or the compiler will optimize out. + if (start) goto Start; + Call_Constructor: + // This line of code will not be executed. + // The purpose of the code is to allow the compiler to generate the assembly code that calls the constructor. 
+ T(); + Start: + // The address of the line of code T() obtained by assembly + char *p = (char *)&&Call_Constructor; // https://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html + // CALL rel32 + void *ret = 0; + char pos; + char call = 0xe8; + do { + pos = *p; + if (pos == call) { + ret = p + 5 + (*(int *)(p + 1)); + } + + } while (!ret && (++p)); + + return ret; + } + +protected: + std::map m_wrappers; +}; + +} + +#endif // STUBEXT_H diff --git a/autotests/dfm-search-tests/CMakeLists.txt b/autotests/dfm-search-tests/CMakeLists.txt index 6d1bb862..4c60beed 100644 --- a/autotests/dfm-search-tests/CMakeLists.txt +++ b/autotests/dfm-search-tests/CMakeLists.txt @@ -27,6 +27,7 @@ target_link_libraries(dfm-search-test # Add size_parser source for testing (it's part of dfm-searcher client but we test it here) target_sources(dfm-search-test PRIVATE ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-client/size_parser.cpp + ${CMAKE_SOURCE_DIR}/3rdparty/testutils/stub-ext/stub-shadow.cpp ) target_include_directories(dfm-search-test @@ -34,6 +35,8 @@ target_include_directories(dfm-search-test ${CMAKE_SOURCE_DIR}/src/dfm-search ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-lib ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-client + ${CMAKE_SOURCE_DIR}/3rdparty/testutils/stub-ext + ${CMAKE_SOURCE_DIR}/3rdparty/testutils/cpp-stub ) # Pass source directory for locating rule files at runtime diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index c6e7d191..d1b360b1 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -20,6 +20,7 @@ extern QObject *create_tst_IsSemanticQuery(); extern QObject *create_tst_SearchTarget(); extern QObject *create_tst_SemanticQueryBuilderTarget(); extern QObject *create_tst_ContentRetriever(); +extern QObject *create_tst_ContentSearchEngine(); int main(int argc, char *argv[]) { @@ -86,5 +87,9 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj15, argc, argv); delete 
testObj15; + QObject *testObj16 = create_tst_ContentSearchEngine(); + result |= QTest::qExec(testObj16, argc, argv); + delete testObj16; + return result; } diff --git a/autotests/dfm-search-tests/tst_content_search_engine.cpp b/autotests/dfm-search-tests/tst_content_search_engine.cpp new file mode 100644 index 00000000..7eb04fb7 --- /dev/null +++ b/autotests/dfm-search-tests/tst_content_search_engine.cpp @@ -0,0 +1,495 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace DFMSEARCH; +using namespace Lucene; + +namespace { + +struct TestDocument +{ + QString path; + QString filename; + QString content; + QString ancestorPath; + QString hidden = "N"; + qint64 modifyTime = 1710000000; + qint64 birthTime = 1700000000; + qint64 fileSize = 1024; +}; + +DocumentPtr buildDocument(const TestDocument &docData) +{ + DocumentPtr doc = newLucene(); + doc->add(newLucene(LuceneFieldNames::Content::kPath, docData.path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kFilename, docData.filename.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kContents, docData.content.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kAncestorPaths, docData.ancestorPath.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kIsHidden, docData.hidden.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kModifyTime, QString::number(docData.modifyTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + 
doc->add(newLucene(LuceneFieldNames::Content::kBirthTime, QString::number(docData.birthTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kFileSize, QString::number(docData.fileSize).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + return doc; +} + +DocumentPtr buildOcrDocument(const TestDocument &docData) +{ + DocumentPtr doc = newLucene(); + doc->add(newLucene(LuceneFieldNames::OcrText::kPath, docData.path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kFilename, docData.filename.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kOcrContents, docData.content.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kAncestorPaths, docData.ancestorPath.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kIsHidden, docData.hidden.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kModifyTime, QString::number(docData.modifyTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kBirthTime, QString::number(docData.birthTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kFileSize, QString::number(docData.fileSize).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kCheckSum, + QStringLiteral("checksum-%1").arg(docData.filename).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + return doc; +} + +void createContentIndex(const QString &indexDir, const QList &documents) +{ + QDir().mkpath(indexDir); + + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + 
newLucene(1, 2), + true, + IndexWriter::MaxFieldLengthLIMITED); + + for (const TestDocument &doc : documents) { + writer->addDocument(buildDocument(doc)); + } + + writer->close(); +} + +void createOcrIndex(const QString &indexDir, const QList &documents) +{ + QDir().mkpath(indexDir); + + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + newLucene(1, 2), + true, + IndexWriter::MaxFieldLengthLIMITED); + + for (const TestDocument &doc : documents) { + writer->addDocument(buildOcrDocument(doc)); + } + + writer->close(); +} + +SearchOptions createBaseOptions(const QString &searchPath, const QString &indexDir) +{ + (void) indexDir; + SearchOptions options; + options.setSearchMethod(SearchMethod::Indexed); + options.setSearchPath(searchPath); + options.setSyncSearchTimeout(5); + return options; +} + +QStringList resultPaths(const SearchResultExpected &expected) +{ + QStringList paths; + const SearchResultList results = expected.value(); + for (const SearchResult &result : results) { + paths.append(result.path()); + } + return paths; +} + +} // namespace + +class tst_ContentSearchEngine : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void search_simpleContent_usesTemporaryIndex(); + void search_booleanAnd_matchesContentOnly(); + void search_booleanOr_matchesAnyContent(); + void search_filenameAndContentMixed_requiresBoth(); + void search_mixedAnd_excludesPureFilenameOnlyMatches(); + void search_simpleOcr_usesTemporaryIndex(); + void search_ocrBooleanAnd_matchesOcrContentOnly(); + void search_ocrBooleanOr_matchesAnyOcrContent(); + void search_ocrFilenameAndContentMixed_requiresBoth(); + void search_ocrMixedAnd_excludesPureFilenameOnlyMatches(); +}; + +void tst_ContentSearchEngine::search_simpleContent_usesTemporaryIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + 
createContentIndex(indexDir, { + { rootDir + "/alpha-report.txt", "alpha-report.txt", "alpha budget summary", rootDir }, + { rootDir + "/meeting-notes.txt", "meeting-notes.txt", "meeting notes and timeline", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-report.txt" }); +} + +void tst_ContentSearchEngine::search_booleanAnd_matchesContentOnly() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/roadmap.txt", "roadmap.txt", "alpha budget roadmap", rootDir }, + { rootDir + "/budget-only.txt", "budget-only.txt", "budget only", rootDir }, + { rootDir + "/alpha-only.txt", "alpha-only.txt", "alpha only", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/roadmap.txt" }); +} + +void tst_ContentSearchEngine::search_booleanOr_matchesAnyContent() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const 
QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/alpha.txt", "alpha.txt", "alpha planning", rootDir }, + { rootDir + "/budget.txt", "budget.txt", "budget tracking", rootDir }, + { rootDir + "/other.txt", "other.txt", "travel notes", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/alpha.txt")); + QVERIFY(paths.contains(rootDir + "/budget.txt")); +} + +void tst_ContentSearchEngine::search_filenameAndContentMixed_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/budget-alpha.txt", "budget-alpha.txt", "alpha roadmap", rootDir }, + { rootDir + "/budget-gamma.txt", "budget-gamma.txt", "gamma roadmap", rootDir }, + { rootDir + "/alpha-only.txt", "alpha-only.txt", "alpha roadmap", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + ContentOptionsAPI contentOptions(options); + contentOptions.setFilenameKeyword("budget"); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = 
engine->searchSync(SearchQuery::createSimpleQuery("alpha")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget-alpha.txt" }); +} + +void tst_ContentSearchEngine::search_mixedAnd_excludesPureFilenameOnlyMatches() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/alpha-budget.txt", "alpha-budget.txt", "general notes", rootDir }, + { rootDir + "/alpha-plan.txt", "alpha-plan.txt", "budget implementation details", rootDir }, + { rootDir + "/budget-plan.txt", "budget-plan.txt", "alpha implementation details", rootDir }, + { rootDir + "/alpha-budget-content.txt", "alpha-budget-content.txt", "alpha budget implementation", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + ContentOptionsAPI contentOptions(options); + contentOptions.setFilenameContentMixedAndSearchEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 3); + QVERIFY(!paths.contains(rootDir + "/alpha-budget.txt")); + QVERIFY(paths.contains(rootDir + "/alpha-plan.txt")); + QVERIFY(paths.contains(rootDir + "/budget-plan.txt")); + QVERIFY(paths.contains(rootDir + "/alpha-budget-content.txt")); +} + +void tst_ContentSearchEngine::search_simpleOcr_usesTemporaryIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + 
"/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/scan-a.png", "scan-a.png", "invoice amount recognized", rootDir }, + { rootDir + "/scan-b.png", "scan-b.png", "meeting room whiteboard", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("invoice")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/scan-a.png" }); +} + +void tst_ContentSearchEngine::search_ocrBooleanAnd_matchesOcrContentOnly() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/receipt.png", "receipt.png", "invoice amount total", rootDir }, + { rootDir + "/amount-only.png", "amount-only.png", "amount only", rootDir }, + { rootDir + "/invoice-only.png", "invoice-only.png", "invoice only", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "invoice", "amount" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/receipt.png" }); +} + +void tst_ContentSearchEngine::search_ocrBooleanOr_matchesAnyOcrContent() +{ + QTemporaryDir tempDir; + 
QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/invoice.png", "invoice.png", "invoice recognized", rootDir }, + { rootDir + "/budget.png", "budget.png", "budget recognized", rootDir }, + { rootDir + "/other.png", "other.png", "travel receipt", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "invoice", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/invoice.png")); + QVERIFY(paths.contains(rootDir + "/budget.png")); +} + +void tst_ContentSearchEngine::search_ocrFilenameAndContentMixed_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/budget-invoice.png", "budget-invoice.png", "invoice details", rootDir }, + { rootDir + "/budget-other.png", "budget-other.png", "other details", rootDir }, + { rootDir + "/invoice-only.png", "invoice-only.png", "invoice details", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + OcrTextOptionsAPI ocrOptions(options); + ocrOptions.setFilenameKeyword("budget"); + + std::unique_ptr 
engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("invoice")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget-invoice.png" }); +} + +void tst_ContentSearchEngine::search_ocrMixedAnd_excludesPureFilenameOnlyMatches() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/invoice-budget.png", "invoice-budget.png", "generic text", rootDir }, + { rootDir + "/invoice-note.png", "invoice-note.png", "budget recognized", rootDir }, + { rootDir + "/budget-note.png", "budget-note.png", "invoice recognized", rootDir }, + { rootDir + "/invoice-budget-content.png", "invoice-budget-content.png", "invoice budget recognized", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + OcrTextOptionsAPI ocrOptions(options); + ocrOptions.setFilenameOcrContentMixedAndSearchEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "invoice", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 3); + QVERIFY(!paths.contains(rootDir + "/invoice-budget.png")); + QVERIFY(paths.contains(rootDir + "/invoice-note.png")); + QVERIFY(paths.contains(rootDir + "/budget-note.png")); + QVERIFY(paths.contains(rootDir + "/invoice-budget-content.png")); +} + +QObject *create_tst_ContentSearchEngine() +{ + return new 
tst_ContentSearchEngine(); +} + +#include "tst_content_search_engine.moc" From a9c2a9edd4c5426675c8494fa839b6930ea6c8d0 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 25 May 2026 08:50:55 +0800 Subject: [PATCH 32/36] perf: optimize OCR text search document loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added selective field loading for OCR text search results to significantly reduce disk I/O when detailed results are not needed. The change introduces a MapFieldSelector to only load necessary fields (path is always loaded, ocr_contents only when full text retrieval is enabled, and additional metadata fields only when detailed results are requested). Technical details: 1. Implemented field selector to skip loading large OCR text content (ocr_contents) unless full text retrieval is enabled 2. Only loads additional metadata fields (filename, timestamps, etc.) when detailed results are requested 3. Maintains all existing functionality while reducing memory usage and disk I/O 4. Preserves backward compatibility with existing search options Influence: 1. Test search functionality with both simple and detailed results requests 2. Verify performance improvement when detailed results are disabled 3. Check memory usage reduction during large result set searches 4. Validate that all required fields are correctly loaded when detailed results are enabled 5. Test edge cases with empty fields or missing document attributes perf: 优化OCR文本搜索文档加载性能 为OCR文本搜索结果添加了选择性字段加载功能,当不需要详细结果时显著减少磁 盘I/O。该变更引入了MapFieldSelector,仅加载必要的字段(路径总是加载,只 有在启用全文检索时才加载ocr_contents,仅在请求详细结果时才加载额外的元数 据字段)。 技术细节: 1. 实现字段选择器以跳过加载大型OCR文本内容(ocr_contents),除非启用全文 检索 2. 只有请求详细结果时才加载额外元数据字段(文件名、时间戳等) 3. 在减少内存使用和磁盘I/O的同时保留所有现有功能 4. 保持与现有搜索选项的向后兼容性 Influence: 1. 使用简单和详细结果请求测试搜索功能 2. 验证关闭详细结果时的性能提升 3. 检查大型结果集搜索时的内存使用减少情况 4. 验证启用详细结果时是否正确加载所有所需字段 5. 
测试空字段或缺失文档属性的边界情况 --- .../ocrtextstrategies/indexedstrategy.cpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index fd821037..5361134e 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -309,6 +310,25 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr bool enableHTML = optAPI.isSearchResultHighlightEnabled(); int previewLen = optAPI.maxPreviewLength() > 0 ? optAPI.maxPreviewLength() : 50; bool enableRetrieval = optAPI.isFullTextRetrievalEnabled(); + bool detailedResults = m_options.detailedResultsEnabled(); + + // Build field selector to avoid loading the large 'ocr_contents' field when not needed. + // The ocr_contents field stores OCR-recognized text and loading it for every result + // (even when only path is needed) causes significant disk I/O overhead. 
+ Lucene::Collection fieldsToLoad = Lucene::Collection::newInstance(); + if (enableRetrieval) { + fieldsToLoad.add(LuceneFieldNames::OcrText::kOcrContents); + } + fieldsToLoad.add(LuceneFieldNames::OcrText::kPath); + if (Q_UNLIKELY(detailedResults)) { + fieldsToLoad.add(LuceneFieldNames::OcrText::kFilename); + fieldsToLoad.add(LuceneFieldNames::OcrText::kIsHidden); + fieldsToLoad.add(LuceneFieldNames::OcrText::kModifyTime); + fieldsToLoad.add(LuceneFieldNames::OcrText::kBirthTime); + fieldsToLoad.add(LuceneFieldNames::OcrText::kCheckSum); + fieldsToLoad.add(LuceneFieldNames::OcrText::kFileSize); + } + Lucene::FieldSelectorPtr fieldSelector = newLucene(fieldsToLoad); // Pre-allocate to avoid reallocation during append m_results.reserve(m_results.size() + static_cast(docsSize)); @@ -328,7 +348,7 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr Lucene::DocumentPtr doc; try { - doc = searcher->doc(scoreDoc->doc); + doc = searcher->doc(scoreDoc->doc, fieldSelector); if (!doc) { qWarning() << "Failed to retrieve document at index:" << scoreDoc->doc; continue; @@ -374,7 +394,7 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } // 设置详细结果(如果启用) - if (Q_UNLIKELY(m_options.detailedResultsEnabled())) { + if (Q_UNLIKELY(detailedResults)) { // 文件名 Lucene::String filenameField = doc->get(LuceneFieldNames::OcrText::kFilename); if (!filenameField.empty()) { From 06a530de016bed9ece4b457d095e7545afe21d14 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 25 May 2026 17:16:05 +0800 Subject: [PATCH 33/36] perf: replace chinese analyzer with ngram search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Removed chinese analyzer and tokenizer files from fulltext 3rdparty 2. Updated CMakeLists.txt to exclude fulltext 3rdparty files 3. Modified QueryBuilder to use ngram search instead of chinese analyzer 4. 
Simplified query building methods and removed analyzer dependencies 5. Improved pinyin and acronym search using ngram queries This change replaces the complex Chinese analyzer implementation with a simpler ngram-based search approach, which should provide more consistent results while being easier to maintain. The ngram search is now used for all text matching, including pinyin and acronym searches. Log: Improved search functionality using ngram matching instead of Chinese analyzer Influence: 1. Test basic search functionality with Chinese characters 2. Verify pinyin and acronym search results 3. Test combined search queries with multiple terms 4. Verify wildcard searches work correctly 5. Test case-sensitive/insensitive searches 6. Check performance impact from analyzer removal perf: 使用ngram搜索替换中文分词器 1. 删除fulltext第三方库中的中文分词器和分析器文件 2. 更新CMakeLists.txt以排除fulltext第三方文件 3. 修改QueryBuilder使用ngram搜索代替中文分词器 4. 简化查询构建方法并移除分析器依赖 5. 使用ngram查询改进拼音和拼音首字母搜索 此次变更用更简单的基于ngram的搜索方法替代了复杂的中文分词器实现,应能提 供更一致的结果同时更易于维护。ngram搜索现在用于包括拼音和拼音首字母在内 的所有文本匹配。 Log: 使用ngram匹配替代中文分词器改进搜索功能 Influence: 1. 测试中文汉字的基础搜索功能 2. 验证拼音和拼音首字母搜索结果 3. 测试包含多条件的组合搜索查询 4. 验证通配符搜索是否正常工作 5. 测试区分大小写/不区分大小写的搜索 6.
检查删除分词器对性能的影响 --- .reuse/dep5 | 5 - .../3rdparty/fulltext/chineseanalyzer.cpp | 49 ------- .../3rdparty/fulltext/chineseanalyzer.h | 53 ------- .../3rdparty/fulltext/chinesetokenizer.cpp | 130 ------------------ .../3rdparty/fulltext/chinesetokenizer.h | 69 ---------- src/dfm-search/dfm-search-lib/CMakeLists.txt | 2 - .../filenamestrategies/indexedstrategy.cpp | 93 +++---------- .../filenamestrategies/indexedstrategy.h | 12 +- 8 files changed, 19 insertions(+), 394 deletions(-) delete mode 100644 src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp delete mode 100644 src/dfm-search/3rdparty/fulltext/chineseanalyzer.h delete mode 100644 src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp delete mode 100644 src/dfm-search/3rdparty/fulltext/chinesetokenizer.h diff --git a/.reuse/dep5 b/.reuse/dep5 index ff15d1f2..82dcd88e 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -43,8 +43,3 @@ Files: src/dfm-burn/3rdparty/udfclient/* Copyright: Reinoud Zandijk License: ClArtistic -# fulltext -Files: src/dfm-search/3rdparty/fulltext/* -Copyright: 2009-2014 Alan Wright -License: LGPL-3.0-or-later - diff --git a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp b/src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp deleted file mode 100644 index dfe14a1a..00000000 --- a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp +++ /dev/null @@ -1,49 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. 
-///////////////////////////////////////////////////////////////////////////// - -#include "chineseanalyzer.h" -#include "chinesetokenizer.h" - -#include -#include - -#define UNUSED(x) (void)x; - -namespace Lucene { - -ChineseAnalyzer::~ChineseAnalyzer() -{ -} - -TokenStreamPtr ChineseAnalyzer::tokenStream(const String &fieldName, const ReaderPtr &reader) -{ - UNUSED(fieldName) - - TokenStreamPtr result = newLucene(reader); - result = newLucene(result); - return result; -} - -TokenStreamPtr ChineseAnalyzer::reusableTokenStream(const String &fieldName, const ReaderPtr &reader) -{ - UNUSED(fieldName) - - ChineseAnalyzerSavedStreamsPtr streams(boost::dynamic_pointer_cast(getPreviousTokenStream())); - if (!streams) { - streams = newLucene(); - streams->source = newLucene(reader); - setPreviousTokenStream(streams); - } else { - streams->source->reset(reader); - } - return streams->source; -} - -ChineseAnalyzerSavedStreams::~ChineseAnalyzerSavedStreams() -{ -} - -} diff --git a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.h b/src/dfm-search/3rdparty/fulltext/chineseanalyzer.h deleted file mode 100644 index 32bd3f2c..00000000 --- a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.h +++ /dev/null @@ -1,53 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. 
-///////////////////////////////////////////////////////////////////////////// - -#ifndef CHINESEANALYZER_H -#define CHINESEANALYZER_H - -#include -#include - -namespace Lucene { - -/** - * An Analyzer that tokenizes text with ChineseTokenizer - * Only used for Lucene++ - */ -class LPPCONTRIBAPI ChineseAnalyzer : public Analyzer -{ -public: - virtual ~ChineseAnalyzer(); - - LUCENE_CLASS(ChineseAnalyzer); - -public: - /// Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. - /// - /// @return A {@link TokenStream} built from {@link ChineseTokenizer}, filtered with {@link ChineseFilter} - virtual TokenStreamPtr tokenStream(const String &fieldName, const ReaderPtr &reader); - - /// Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the - /// provided {@link Reader}. - /// - /// @return A {@link TokenStream} built from {@link ChineseTokenizer}, filtered with {@link ChineseFilter} - virtual TokenStreamPtr reusableTokenStream(const String &fieldName, const ReaderPtr &reader); -}; - -class LPPCONTRIBAPI ChineseAnalyzerSavedStreams : public LuceneObject -{ -public: - virtual ~ChineseAnalyzerSavedStreams(); - - LUCENE_CLASS(ChineseAnalyzerSavedStreams); - -public: - TokenizerPtr source; - TokenStreamPtr result; -}; - -} - -#endif // CHINESEANALYZER_H diff --git a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp b/src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp deleted file mode 100644 index addb9203..00000000 --- a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. 
-///////////////////////////////////////////////////////////////////////////// - -#include -#include -#include -#include -#include -#include -#include - -#include "chinesetokenizer.h" - -namespace Lucene { - -const int32_t ChineseTokenizer::kMaxWordLen = 255; -const int32_t ChineseTokenizer::kIoBufferSize = 1024; - -ChineseTokenizer::ChineseTokenizer(const ReaderPtr &input) - : Tokenizer(input) -{ -} - -ChineseTokenizer::ChineseTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input) - : Tokenizer(source, input) -{ -} - -ChineseTokenizer::ChineseTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input) - : Tokenizer(factory, input) -{ -} - -ChineseTokenizer::~ChineseTokenizer() -{ -} - -void ChineseTokenizer::initialize() -{ - offset = 0; - bufferIndex = 0; - dataLen = 0; - buffer = CharArray::newInstance(kMaxWordLen); - memset(buffer.get(), 0, kMaxWordLen); - ioBuffer = CharArray::newInstance(kIoBufferSize); - memset(ioBuffer.get(), 0, kIoBufferSize); - length = 0; - start = 0; - - termAtt = addAttribute(); - offsetAtt = addAttribute(); -} - -void ChineseTokenizer::push(wchar_t c) -{ - if (length == 0) { - start = offset - 1; // start of token - } - buffer[length++] = CharFolder::toLower(c); // buffer it -} - -bool ChineseTokenizer::flush() -{ - if (length > 0) { - termAtt->setTermBuffer(buffer.get(), 0, length); - offsetAtt->setOffset(correctOffset(start), correctOffset(start + length)); - return true; - } else { - return false; - } -} - -bool ChineseTokenizer::incrementToken() -{ - clearAttributes(); - - length = 0; - start = offset; - - while (true) { - wchar_t c; - ++offset; - - if (bufferIndex >= dataLen) { - dataLen = input->read(ioBuffer.get(), 0, ioBuffer.size()); - bufferIndex = 0; - } - - if (dataLen == -1) { - --offset; - return flush(); - } else { - c = ioBuffer[bufferIndex++]; - } - - if (length > 0) { - --bufferIndex; - --offset; - return flush(); - } - push(c); - return flush(); - } -} - -void ChineseTokenizer::end() -{ - // 
set final offset - int32_t finalOffset = correctOffset(offset); - offsetAtt->setOffset(finalOffset, finalOffset); -} - -void ChineseTokenizer::reset() -{ - Tokenizer::reset(); - offset = 0; - bufferIndex = 0; - dataLen = 0; -} - -void ChineseTokenizer::reset(const ReaderPtr &input) -{ - Tokenizer::reset(input); - reset(); -} - -} diff --git a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.h b/src/dfm-search/3rdparty/fulltext/chinesetokenizer.h deleted file mode 100644 index c93759d6..00000000 --- a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.h +++ /dev/null @@ -1,69 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. -///////////////////////////////////////////////////////////////////////////// - -#ifndef CHINESETOKENIZER_H -#define CHINESETOKENIZER_H - -#include - -/** - * An tokenizer that tokenizes chinese - * Only used for Lucene++ - */ -namespace Lucene { -class ChineseTokenizer : public Tokenizer -{ -public: - explicit ChineseTokenizer(const ReaderPtr &input); - ChineseTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input); - ChineseTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input); - - virtual ~ChineseTokenizer(); - - LUCENE_CLASS(ChineseTokenizer); - -protected: - /// Max word length - static const int32_t kMaxWordLen; - - static const int32_t kIoBufferSize; - -protected: - /// word offset, used to imply which character(in) is parsed - int32_t offset; - - /// the index used only for ioBuffer - int32_t bufferIndex; - - /// data length - int32_t dataLen; - - /// character buffer, store the characters which are used to compose the returned Token - CharArray buffer; - - /// I/O buffer, used to store the content of the input (one of the members of Tokenizer) - CharArray ioBuffer; - - TermAttributePtr 
termAtt; - OffsetAttributePtr offsetAtt; - - int32_t length; - int32_t start; - -public: - virtual void initialize(); - virtual bool incrementToken(); - virtual void end(); - virtual void reset(); - virtual void reset(const ReaderPtr &input); - -protected: - void push(wchar_t c); - bool flush(); -}; -} - -#endif // CHINESETOKENIZER_H diff --git a/src/dfm-search/dfm-search-lib/CMakeLists.txt b/src/dfm-search/dfm-search-lib/CMakeLists.txt index b8c12c62..fe26131b 100644 --- a/src/dfm-search/dfm-search-lib/CMakeLists.txt +++ b/src/dfm-search/dfm-search-lib/CMakeLists.txt @@ -11,8 +11,6 @@ file(GLOB_RECURSE PUBLIC_INCLUDES CONFIGURE_DEPENDS FILE (GLOB_RECURSE SRCS CONFIGURE_DEPENDS "./*.cpp" "./*.h" - "../3rdparty/*.cpp" - "../3rdparty/*.h" ) # Qt6 diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp index 45487e71..985f7dde 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp @@ -16,7 +16,6 @@ #include #include -#include "3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" #include "utils/searchutility.h" #include "utils/lucenequeryutils.h" @@ -85,9 +84,8 @@ Lucene::QueryPtr QueryBuilder::buildPinyinQuery(const QStringList &pinyins, Sear for (const QString &pinyin : pinyins) { QString cleanPinyin = pinyin.trimmed(); if (!cleanPinyin.isEmpty() && Global::isPinyinSequence(cleanPinyin)) { - // 复用buildCommonQuery,指定pinyin字段,让分析器自动处理匹配 - QueryPtr termQuery = buildCommonQuery(cleanPinyin, false, newLucene(), - QString::fromWCharArray(LuceneFieldNames::FileName::kPinyin), false); + QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::FileName::kPinyin), cleanPinyin); if (termQuery) { pinyinQuery->add(termQuery, op == 
SearchQuery::BooleanOperator::AND ? BooleanClause::MUST : BooleanClause::SHOULD); } @@ -108,10 +106,8 @@ Lucene::QueryPtr QueryBuilder::buildPinyinAcronymQuery(const QStringList &acrony for (const QString &acronym : acronyms) { QString cleanAcronym = acronym.trimmed(); if (!cleanAcronym.isEmpty()) { - // 复用buildCommonQuery,指定pinyin_acronym字段,让分析器自动处理匹配 - QueryPtr termQuery = buildCommonQuery(cleanAcronym, false, - newLucene(), - QString::fromWCharArray(LuceneFieldNames::FileName::kPinyinAcronym), false); + QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::FileName::kPinyinAcronym), cleanAcronym); if (termQuery) { acronymQuery->add(termQuery, op == SearchQuery::BooleanOperator::AND ? BooleanClause::MUST : BooleanClause::SHOULD); } @@ -121,48 +117,13 @@ Lucene::QueryPtr QueryBuilder::buildPinyinAcronymQuery(const QStringList &acrony return acronymQuery; } -Lucene::QueryPtr QueryBuilder::buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, bool allowWildcard) const +Lucene::QueryPtr QueryBuilder::buildSimpleQuery(const QString &keyword, bool caseSensitive) const { - if (keyword.isEmpty() || !analyzer) { - return nullptr; - } - - Lucene::QueryParserPtr parser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::FileName::kFileName, - analyzer); - - if (allowWildcard) { - parser->setAllowLeadingWildcard(true); - } - - return parser->parse(LuceneQueryUtils::processQueryString(keyword, caseSensitive)); -} - -Lucene::QueryPtr QueryBuilder::buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, const QString &fieldName, bool allowWildcard) const -{ - if (keyword.isEmpty() || !analyzer || fieldName.isEmpty()) { - return nullptr; - } - - Lucene::QueryParserPtr parser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - StringUtils::toUnicode(fieldName.toStdString()), - analyzer); - - if (allowWildcard) { - 
parser->setAllowLeadingWildcard(true); - } - - return parser->parse(LuceneQueryUtils::processQueryString(keyword, caseSensitive)); + return LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::FileName::kFileName), keyword, caseSensitive); } -Lucene::QueryPtr QueryBuilder::buildSimpleQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const -{ - return buildCommonQuery(keyword, caseSensitive, analyzer, false); -} - -Lucene::QueryPtr QueryBuilder::buildWildcardQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const +Lucene::QueryPtr QueryBuilder::buildWildcardQuery(const QString &keyword, bool caseSensitive) const { if (keyword.isEmpty()) { return nullptr; @@ -177,27 +138,6 @@ Lucene::QueryPtr QueryBuilder::buildWildcardQuery(const QString &keyword, bool c StringUtils::toUnicode(processedKeyword.toStdString()))); } -Lucene::QueryPtr QueryBuilder::buildBooleanQuery(const QStringList &terms, bool caseSensitive, SearchQuery::BooleanOperator op, const Lucene::AnalyzerPtr &analyzer) const -{ - if (terms.isEmpty() || !analyzer) { - return nullptr; - } - - BooleanQueryPtr booleanQuery = newLucene(); - booleanQuery->setMaxClauseCount(1024); - - for (const QString &term : terms) { - if (!term.isEmpty()) { - QueryPtr termQuery = buildCommonQuery(term, caseSensitive, analyzer, false); - if (termQuery) { - booleanQuery->add(termQuery, op == SearchQuery::BooleanOperator::AND ? 
BooleanClause::MUST : BooleanClause::SHOULD); - } - } - } - - return booleanQuery; -} - //-------------------------------------------------------------------- // IndexManager 实现 //-------------------------------------------------------------------- @@ -621,12 +561,11 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que { BooleanQueryPtr finalQuery = newLucene(); bool hasValidQuery = false; - AnalyzerPtr analyzer = newLucene(); switch (query.type) { case SearchType::Simple: if (!query.terms.isEmpty()) { - QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive); if (simpleQuery) { finalQuery->add(simpleQuery, BooleanClause::MUST); hasValidQuery = true; @@ -635,7 +574,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que break; case SearchType::Wildcard: if (!query.terms.isEmpty()) { - QueryPtr wildcardQuery = m_queryBuilder->buildWildcardQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr wildcardQuery = m_queryBuilder->buildWildcardQuery(query.terms.first(), query.caseSensitive); if (wildcardQuery) { finalQuery->add(wildcardQuery, BooleanClause::MUST); hasValidQuery = true; @@ -644,7 +583,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que break; case SearchType::Boolean: if (!query.terms.isEmpty()) { - BooleanQueryPtr booleanQuery = buildBooleanTermsQuery(query, analyzer); + BooleanQueryPtr booleanQuery = buildBooleanTermsQuery(query); if (booleanQuery) { finalQuery->add(booleanQuery, BooleanClause::MUST); hasValidQuery = true; @@ -665,7 +604,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } // 添加普通关键词查询 - QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr simpleQuery = 
m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive); if (simpleQuery) { combinedQuery->add(simpleQuery, BooleanClause::SHOULD); hasValidQuery = true; @@ -690,7 +629,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } // 添加普通关键词查询 - QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive); if (simpleQuery) { combinedQuery->add(simpleQuery, BooleanClause::SHOULD); hasValidQuery = true; @@ -703,7 +642,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que break; case SearchType::Combined: if (!query.terms.isEmpty()) { - BooleanQueryPtr combinedQuery = buildBooleanTermsQuery(query, analyzer); + BooleanQueryPtr combinedQuery = buildBooleanTermsQuery(query); if (combinedQuery) { finalQuery->add(combinedQuery, BooleanClause::MUST); hasValidQuery = true; @@ -796,7 +735,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que return hasValidQuery ? 
finalQuery : nullptr; } -BooleanQueryPtr FileNameIndexedStrategy::buildBooleanTermsQuery(const IndexQuery &query, const AnalyzerPtr &analyzer) const +BooleanQueryPtr FileNameIndexedStrategy::buildBooleanTermsQuery(const IndexQuery &query) const { // 创建布尔查询 BooleanQueryPtr booleanQuery = newLucene(); @@ -808,7 +747,7 @@ BooleanQueryPtr FileNameIndexedStrategy::buildBooleanTermsQuery(const IndexQuery bool termHasQuery = false; // 添加普通关键词查询 - QueryPtr keywordQuery = m_queryBuilder->buildSimpleQuery(term, query.caseSensitive, analyzer); + QueryPtr keywordQuery = m_queryBuilder->buildSimpleQuery(term, query.caseSensitive); if (keywordQuery) { termQuery->add(keywordQuery, BooleanClause::SHOULD); termHasQuery = true; diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h index c9f7c763..034ee4b8 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h @@ -89,7 +89,7 @@ class FileNameIndexedStrategy : public FileNameBaseStrategy QueryPtr buildLuceneQuery(const IndexQuery &query) const; // 构建布尔查询的辅助方法 - BooleanQueryPtr buildBooleanTermsQuery(const IndexQuery &query, const AnalyzerPtr &analyzer) const; + BooleanQueryPtr buildBooleanTermsQuery(const IndexQuery &query) const; // 处理详细搜索结果(读取所有索引字段) SearchResult processDetailedSearchResult(const QString &path, const Lucene::DocumentPtr &doc); @@ -116,14 +116,8 @@ class QueryBuilder QueryPtr buildExtQuery(const QStringList &extensions) const; QueryPtr buildPinyinQuery(const QStringList &pinyins, SearchQuery::BooleanOperator op = SearchQuery::BooleanOperator::AND) const; QueryPtr buildPinyinAcronymQuery(const QStringList &acronyms, SearchQuery::BooleanOperator op = SearchQuery::BooleanOperator::AND) const; - QueryPtr buildBooleanQuery(const QStringList &terms, bool caseSensitive, 
SearchQuery::BooleanOperator op, const Lucene::AnalyzerPtr &analyzer) const; - QueryPtr buildWildcardQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const; - QueryPtr buildSimpleQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const; - -private: - // 通用的查询构建方法 - QueryPtr buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, bool allowWildcard = false) const; - QueryPtr buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, const QString &fieldName, bool allowWildcard = false) const; + QueryPtr buildWildcardQuery(const QString &keyword, bool caseSensitive) const; + QueryPtr buildSimpleQuery(const QString &keyword, bool caseSensitive) const; }; /** From 3681996557ee4618bb9ac4b60c6021d24a71c8aa Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 25 May 2026 20:32:23 +0800 Subject: [PATCH 34/36] test: add filename search engine test cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added comprehensive test cases for FileNameSearchEngine functionality including: 1. Basic keyword search with indexed and realtime modes 2. Boolean AND/OR queries and wildcard pattern matching 3. File type and extension filters 4. Hidden files and excluded paths handling 5. Size and time range filters 6. Pinyin and acronym search support 7. Detailed result attributes verification 8. Error handling for invalid inputs The tests validate both indexed (Lucene-based) and realtime (filesystem scan) search modes to ensure consistent behavior across different search methods. Influence: 1. Verify all test cases pass with different combinations of search parameters 2. Test with various file types and naming patterns 3. Check behavior with hidden files and excluded directories 4. Validate time and size filter boundaries 5. Confirm detailed result attributes are accurate 6. 
Test error conditions like empty queries and invalid file types test: 新增文件名搜索引擎测试用例 添加了全面的文件名搜索引擎功能测试,包括: 1. 基本关键字搜索(索引模式与实时模式) 2. 布尔AND/OR查询和通配符匹配 3. 文件类型和后缀过滤 4. 隐藏文件和排除路径处理 5. 大小和时间范围过滤 6. 拼音和拼音首字母搜索支持 7. 详细结果属性验证 8. 无效输入的错误处理 这些测试覆盖了索引(基于Lucene)和实时(文件系统扫描)两种搜索模式,确保 不同搜索方法的行为一致性。 Influence: 1. 验证所有测试用例在不同搜索参数组合下都能通过 2. 使用各种文件类型和命名模式进行测试 3. 检查隐藏文件和排除目录的处理行为 4. 验证时间和大小过滤的边界条件 5. 确认详细结果属性的准确性 6. 测试空查询和无效文件类型等错误条件 --- autotests/dfm-search-tests/main.cpp | 5 + .../tst_filename_search_engine.cpp | 919 ++++++++++++++++++ 2 files changed, 924 insertions(+) create mode 100644 autotests/dfm-search-tests/tst_filename_search_engine.cpp diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index d1b360b1..81430b65 100644 --- a/autotests/dfm-search-tests/main.cpp +++ b/autotests/dfm-search-tests/main.cpp @@ -21,6 +21,7 @@ extern QObject *create_tst_SearchTarget(); extern QObject *create_tst_SemanticQueryBuilderTarget(); extern QObject *create_tst_ContentRetriever(); extern QObject *create_tst_ContentSearchEngine(); +extern QObject *create_tst_FileNameSearchEngine(); int main(int argc, char *argv[]) { @@ -91,5 +92,9 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj16, argc, argv); delete testObj16; + QObject *testObj17 = create_tst_FileNameSearchEngine(); + result |= QTest::qExec(testObj17, argc, argv); + delete testObj17; + return result; } diff --git a/autotests/dfm-search-tests/tst_filename_search_engine.cpp b/autotests/dfm-search-tests/tst_filename_search_engine.cpp new file mode 100644 index 00000000..c599b6c3 --- /dev/null +++ b/autotests/dfm-search-tests/tst_filename_search_engine.cpp @@ -0,0 +1,919 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace DFMSEARCH; +using namespace Lucene; + +namespace { + +struct TestDocument +{ + QString path; + QString filename; + QString fileType; + QString fileExt; + QString pinyin; + QString pinyinAcronym; + QString hidden = "N"; + qint64 modifyTime = 1710000000; + qint64 birthTime = 1700000000; + qint64 fileSize = 1024; + QString fileSizeStr = "1 KB"; +}; + +QStringList ancestorPathsForDocument(const QString &path) +{ + QStringList ancestors; + QFileInfo info(path); + QDir dir = info.dir(); + + while (dir.exists()) { + const QString current = QDir::cleanPath(dir.absolutePath()); + ancestors.append(current); + if (!dir.cdUp()) { + break; + } + } + + ancestors.removeDuplicates(); + return ancestors; +} + +DocumentPtr buildDocument(const TestDocument &docData) +{ + DocumentPtr doc = newLucene(); + + doc->add(newLucene(LuceneFieldNames::FileName::kFullPath, docData.path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileName, docData.filename.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileNameLower, docData.filename.toLower().toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileType, docData.fileType.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileExt, docData.fileExt.toLower().toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kPinyin, docData.pinyin.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kPinyinAcronym, 
docData.pinyinAcronym.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kIsHidden, docData.hidden.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + NumericFieldPtr modifyTimeField = newLucene(LuceneFieldNames::FileName::kModifyTime, + Field::STORE_YES, true); + modifyTimeField->setLongValue(docData.modifyTime); + doc->add(modifyTimeField); + + NumericFieldPtr birthTimeField = newLucene(LuceneFieldNames::FileName::kBirthTime, + Field::STORE_YES, true); + birthTimeField->setLongValue(docData.birthTime); + doc->add(birthTimeField); + + NumericFieldPtr fileSizeField = newLucene(LuceneFieldNames::FileName::kFileSize, + Field::STORE_YES, true); + fileSizeField->setLongValue(docData.fileSize); + doc->add(fileSizeField); + doc->add(newLucene(LuceneFieldNames::FileName::kFileSizeStr, docData.fileSizeStr.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + + for (const QString &ancestor : ancestorPathsForDocument(docData.path)) { + doc->add(newLucene(LuceneFieldNames::FileName::kAncestorPaths, ancestor.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + } + + return doc; +} + +void createFileNameIndex(const QString &indexDir, const QList &documents) +{ + QDir().mkpath(indexDir); + + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + newLucene(1, 2), + true, + IndexWriter::MaxFieldLengthLIMITED); + + for (const TestDocument &doc : documents) { + writer->addDocument(buildDocument(doc)); + } + + writer->close(); +} + +SearchOptions createBaseOptions(const QString &searchPath) +{ + SearchOptions options; + options.setSearchMethod(SearchMethod::Indexed); + options.setSearchPath(searchPath); + options.setSyncSearchTimeout(5); + return options; +} + +SearchOptions createRealtimeOptions(const QString &searchPath) +{ + SearchOptions options = createBaseOptions(searchPath); + options.setSearchMethod(SearchMethod::Realtime); + return options; +} + 
+bool createFileWithSize(const QString &path, qint64 size) +{ + QFile file(path); + if (!file.open(QIODevice::WriteOnly)) { + return false; + } + + if (size > 0) { + file.write(QByteArray(static_cast(size), 'a')); + } + + file.close(); + return true; +} + +QStringList resultPaths(const SearchResultExpected &expected) +{ + QStringList paths; + const SearchResultList results = expected.value(); + for (const SearchResult &result : results) { + paths.append(result.path()); + } + return paths; +} + +SearchQuery createWildcardQuery(const QString &pattern) +{ + SearchQuery query(pattern, SearchQuery::Type::Wildcard); + return query; +} + +} // namespace + +class tst_FileNameSearchEngine : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void search_simpleKeyword_matchesIndexedFilename(); + void search_booleanAnd_requiresAllTerms(); + void search_booleanOr_matchesAnyTerm(); + void search_wildcard_matchesByPattern(); + void search_fileTypeFilterOnly_returnsAllMatchingTypes(); + void search_extensionFilterOnly_returnsAllMatchingSuffixes(); + void search_keywordAndTypeFilter_requiresBoth(); + void search_keywordAndExtensionFilter_requiresBoth(); + void search_hiddenFiles_excludedByDefault_andIncludedWhenEnabled(); + void search_excludedPath_filtersSubtreeAtQueryLayer(); + void search_sizeAndTimeFilters_applyOnIndexedFields(); + void search_pinyinAndAcronym_queriesMatchIndexedFields(); + void search_detailedResults_populatesExtendedAttributes(); + void search_emptyKeywordWithoutFilters_returnsValidationError(); + void search_invalidFileType_returnsValidationError(); + void realtime_simpleKeyword_matchesFilesystemEntries(); + void realtime_booleanAndOr_andWildcard_queriesWork(); + void realtime_extensionFilters_areApplied(); + void realtime_hiddenAndExcludedPath_filtersWork(); + void realtime_sizeAndTimeFilters_applyWithoutIndex(); + void realtime_detailedResults_populateAttributes(); + void realtime_pinyinOption_doesNotProducePinyinMatches(); +}; + +void 
tst_FileNameSearchEngine::search_simpleKeyword_matchesIndexedFilename() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/alpha-report.txt", "alpha-report.txt", "doc", "txt" }, + { rootDir + "/meeting-notes.txt", "meeting-notes.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("report")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-report.txt" }); +} + +void tst_FileNameSearchEngine::search_booleanAnd_requiresAllTerms() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/alpha-budget.txt", "alpha-budget.txt", "doc", "txt" }, + { rootDir + "/alpha-only.txt", "alpha-only.txt", "doc", "txt" }, + { rootDir + "/budget-only.txt", "budget-only.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + 
"/alpha-budget.txt" }); +} + +void tst_FileNameSearchEngine::search_booleanOr_matchesAnyTerm() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/alpha.txt", "alpha.txt", "doc", "txt" }, + { rootDir + "/budget.xlsx", "budget.xlsx", "doc", "xlsx" }, + { rootDir + "/travel.jpg", "travel.jpg", "pic", "jpg" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/alpha.txt")); + QVERIFY(paths.contains(rootDir + "/budget.xlsx")); +} + +void tst_FileNameSearchEngine::search_wildcard_matchesByPattern() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/Budget-2026.txt", "Budget-2026.txt", "doc", "txt" }, + { rootDir + "/Budget-2025.txt", "Budget-2025.txt", "doc", "txt" }, + { rootDir + "/Notes-2026.txt", "Notes-2026.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + options.setCaseSensitive(false); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + 
engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(createWildcardQuery("budget-202?.txt")); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/Budget-2026.txt")); + QVERIFY(paths.contains(rootDir + "/Budget-2025.txt")); +} + +void tst_FileNameSearchEngine::search_fileTypeFilterOnly_returnsAllMatchingTypes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/report.txt", "report.txt", "doc", "txt" }, + { rootDir + "/slides.pptx", "slides.pptx", "doc", "pptx" }, + { rootDir + "/holiday.jpg", "holiday.jpg", "pic", "jpg" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileTypes({ "doc" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/report.txt")); + QVERIFY(paths.contains(rootDir + "/slides.pptx")); +} + +void tst_FileNameSearchEngine::search_extensionFilterOnly_returnsAllMatchingSuffixes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/one.txt", "one.txt", "doc", "txt" }, + { rootDir + "/two.TXT", 
"two.TXT", "doc", "txt" }, + { rootDir + "/three.md", "three.md", "doc", "md" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileExtensions({ "txt" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/one.txt")); + QVERIFY(paths.contains(rootDir + "/two.TXT")); +} + +void tst_FileNameSearchEngine::search_keywordAndTypeFilter_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/budget.txt", "budget.txt", "doc", "txt" }, + { rootDir + "/budget.jpg", "budget.jpg", "pic", "jpg" }, + { rootDir + "/notes.txt", "notes.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileTypes({ "doc" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget.txt" }); +} + +void tst_FileNameSearchEngine::search_keywordAndExtensionFilter_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + 
const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/budget.txt", "budget.txt", "doc", "txt" }, + { rootDir + "/budget.md", "budget.md", "doc", "md" }, + { rootDir + "/summary.txt", "summary.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileExtensions({ "txt" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget.txt" }); +} + +void tst_FileNameSearchEngine::search_hiddenFiles_excludedByDefault_andIncludedWhenEnabled() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/visible-plan.txt", "visible-plan.txt", "doc", "txt", "", "", "N" }, + { rootDir + "/.hidden-plan.txt", ".hidden-plan.txt", "doc", "txt", "", "", "Y" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions defaultOptions = createBaseOptions(rootDir); + engine->setSearchOptions(defaultOptions); + const SearchResultExpected defaultExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(defaultExpected.hasValue()); + QCOMPARE(resultPaths(defaultExpected), QStringList { rootDir + 
"/visible-plan.txt" }); + + SearchOptions includeHiddenOptions = createBaseOptions(rootDir); + includeHiddenOptions.setIncludeHidden(true); + engine->setSearchOptions(includeHiddenOptions); + const SearchResultExpected includeHiddenExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(includeHiddenExpected.hasValue()); + QCOMPARE(resultPaths(includeHiddenExpected).size(), 2); +} + +void tst_FileNameSearchEngine::search_excludedPath_filtersSubtreeAtQueryLayer() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString includedDir = rootDir + "/included"; + const QString excludedDir = rootDir + "/excluded"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(includedDir)); + QVERIFY(QDir().mkpath(excludedDir)); + + createFileNameIndex(indexDir, { + { includedDir + "/budget.txt", "budget.txt", "doc", "txt" }, + { excludedDir + "/budget.txt", "budget.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + options.setSearchExcludedPaths({ excludedDir }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { includedDir + "/budget.txt" }); +} + +void tst_FileNameSearchEngine::search_sizeAndTimeFilters_applyOnIndexedFields() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/recent-large.txt", "recent-large.txt", "doc", "txt", "", "", "N", 
1712000000, 1700000000, 4096, "4 KB" }, + { rootDir + "/recent-small.txt", "recent-small.txt", "doc", "txt", "", "", "N", 1712000000, 1700000000, 128, "128 B" }, + { rootDir + "/old-large.txt", "old-large.txt", "doc", "txt", "", "", "N", 1701000000, 1690000000, 4096, "4 KB" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + SizeRangeFilter sizeFilter; + sizeFilter.setMin(1024); + options.setSizeRangeFilter(sizeFilter); + + TimeRangeFilter timeFilter; + timeFilter.setTimeField(TimeField::ModifyTime) + .setRange(QDateTime::fromSecsSinceEpoch(1711500000), + QDateTime::fromSecsSinceEpoch(1712500000)); + options.setTimeRangeFilter(timeFilter); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("recent")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/recent-large.txt" }); +} + +void tst_FileNameSearchEngine::search_pinyinAndAcronym_queriesMatchIndexedFields() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/项目计划.docx", "项目计划.docx", "doc", "docx", "xiangmujihua", "xmjh" }, + { rootDir + "/项目总结.docx", "项目总结.docx", "doc", "docx", "xiangmuzongjie", "xmzj" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions pinyinOptions = createBaseOptions(rootDir); + FileNameOptionsAPI pinyinApi(pinyinOptions); + pinyinApi.setPinyinEnabled(true); + 
engine->setSearchOptions(pinyinOptions); + + const SearchResultExpected pinyinExpected = engine->searchSync(SearchQuery::createSimpleQuery("xiangmujihua")); + QVERIFY(pinyinExpected.hasValue()); + QCOMPARE(resultPaths(pinyinExpected), QStringList { rootDir + "/项目计划.docx" }); + + SearchOptions acronymOptions = createBaseOptions(rootDir); + FileNameOptionsAPI acronymApi(acronymOptions); + acronymApi.setPinyinAcronymEnabled(true); + engine->setSearchOptions(acronymOptions); + + const SearchResultExpected acronymExpected = engine->searchSync(SearchQuery::createSimpleQuery("xmjh")); + QVERIFY(acronymExpected.hasValue()); + QCOMPARE(resultPaths(acronymExpected), QStringList { rootDir + "/项目计划.docx" }); +} + +void tst_FileNameSearchEngine::search_detailedResults_populatesExtendedAttributes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + const qint64 modifyTime = 1712345678; + const qint64 birthTime = 1701234567; + createFileNameIndex(indexDir, { + { rootDir + "/archive.zip", "archive.zip", "archive", "zip", "", "", "N", modifyTime, birthTime, 2048, "2 KB" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + options.setDetailedResultsEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("archive")); + QVERIFY(expected.hasValue()); + QCOMPARE(expected.value().size(), 1); + + SearchResult result = expected.value().first(); + FileNameResultAPI api(result); + QCOMPARE(result.path(), rootDir + "/archive.zip"); + QCOMPARE(api.filename(), QString("archive.zip")); + QCOMPARE(api.fileExtension(), QString("zip")); + 
QCOMPARE(api.fileType(), QString("archive")); + QCOMPARE(api.fileSizeBytes(), qint64(2048)); + QCOMPARE(api.size(), QString("2 KB")); + QCOMPARE(api.modifyTimestamp(), modifyTime); + QCOMPARE(api.birthTimestamp(), birthTime); + QCOMPARE(api.isDirectory(), false); + QCOMPARE(api.isHidden(), false); +} + +void tst_FileNameSearchEngine::search_emptyKeywordWithoutFilters_returnsValidationError() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(!expected.hasValue()); + QCOMPARE(expected.error().code().value(), static_cast(FileNameSearchErrorCode::KeywordIsEmpty)); +} + +void tst_FileNameSearchEngine::search_invalidFileType_returnsValidationError() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileTypes({ "invalid-type" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("report")); + QVERIFY(!expected.hasValue()); + QCOMPARE(expected.error().code().value(), static_cast(FileNameSearchErrorCode::InvalidFileTypes)); +} + +void tst_FileNameSearchEngine::realtime_simpleKeyword_matchesFilesystemEntries() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/alpha-report.txt", 32)); + QVERIFY(createFileWithSize(rootDir + "/meeting-notes.txt", 32)); + + 
std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createRealtimeOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("report")); + QVERIFY(expected.hasValue()); + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-report.txt" }); +} + +void tst_FileNameSearchEngine::realtime_booleanAndOr_andWildcard_queriesWork() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/alpha-budget.txt", 16)); + QVERIFY(createFileWithSize(rootDir + "/alpha-only.txt", 16)); + QVERIFY(createFileWithSize(rootDir + "/budget-only.txt", 16)); + QVERIFY(createFileWithSize(rootDir + "/Budget-2026.txt", 16)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions boolOptions = createRealtimeOptions(rootDir); + engine->setSearchOptions(boolOptions); + + const SearchResultExpected andExpected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(andExpected.hasValue()); + QCOMPARE(resultPaths(andExpected), QStringList { rootDir + "/alpha-budget.txt" }); + + const SearchResultExpected orExpected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(orExpected.hasValue()); + QCOMPARE(orExpected.value().size(), 4); + + SearchOptions wildcardOptions = createRealtimeOptions(rootDir); + wildcardOptions.setCaseSensitive(false); + engine->setSearchOptions(wildcardOptions); + + const SearchResultExpected wildcardExpected = engine->searchSync(createWildcardQuery("budget-202?.txt")); + QVERIFY(wildcardExpected.hasValue()); + QCOMPARE(resultPaths(wildcardExpected), QStringList { rootDir + "/Budget-2026.txt" }); +} + +void tst_FileNameSearchEngine::realtime_extensionFilters_areApplied() 
+{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/report.txt", 10)); + QVERIFY(createFileWithSize(rootDir + "/slides.pptx", 10)); + QVERIFY(createFileWithSize(rootDir + "/holiday.jpg", 10)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions extOptions = createRealtimeOptions(rootDir); + FileNameOptionsAPI extApi(extOptions); + extApi.setFileExtensions({ "txt", "pptx" }); + engine->setSearchOptions(extOptions); + + const SearchResultExpected extExpected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(extExpected.hasValue()); + QCOMPARE(extExpected.value().size(), 2); +} + +void tst_FileNameSearchEngine::realtime_hiddenAndExcludedPath_filtersWork() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString includedDir = rootDir + "/included"; + const QString excludedDir = rootDir + "/excluded"; + QVERIFY(QDir().mkpath(includedDir)); + QVERIFY(QDir().mkpath(excludedDir)); + QVERIFY(createFileWithSize(includedDir + "/visible-plan.txt", 8)); + QVERIFY(createFileWithSize(rootDir + "/.hidden-plan.txt", 8)); + QVERIFY(createFileWithSize(excludedDir + "/visible-plan.txt", 8)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions defaultOptions = createRealtimeOptions(rootDir); + defaultOptions.setSearchExcludedPaths({ excludedDir }); + engine->setSearchOptions(defaultOptions); + + const SearchResultExpected defaultExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(defaultExpected.hasValue()); + QCOMPARE(resultPaths(defaultExpected), QStringList { includedDir + "/visible-plan.txt" }); + + SearchOptions includeHiddenOptions = createRealtimeOptions(rootDir); + includeHiddenOptions.setIncludeHidden(true); + 
includeHiddenOptions.setSearchExcludedPaths({ excludedDir }); + engine->setSearchOptions(includeHiddenOptions); + + const SearchResultExpected includeHiddenExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(includeHiddenExpected.hasValue()); + const QStringList paths = resultPaths(includeHiddenExpected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(includedDir + "/visible-plan.txt")); + QVERIFY(paths.contains(rootDir + "/.hidden-plan.txt")); +} + +void tst_FileNameSearchEngine::realtime_sizeAndTimeFilters_applyWithoutIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + + const QString recentLarge = rootDir + "/recent-large.txt"; + const QString recentSmall = rootDir + "/recent-small.txt"; + const QString oldLarge = rootDir + "/old-large.txt"; + QVERIFY(createFileWithSize(recentLarge, 4096)); + QVERIFY(createFileWithSize(recentSmall, 128)); + QVERIFY(createFileWithSize(oldLarge, 4096)); + + const QDateTime recentTime = QDateTime::fromSecsSinceEpoch(1712000000); + const QDateTime oldTime = QDateTime::fromSecsSinceEpoch(1701000000); + QFile recentLargeFile(recentLarge); + QFile recentSmallFile(recentSmall); + QFile oldLargeFile(oldLarge); + QVERIFY(recentLargeFile.open(QIODevice::ReadWrite)); + QVERIFY(recentSmallFile.open(QIODevice::ReadWrite)); + QVERIFY(oldLargeFile.open(QIODevice::ReadWrite)); + QVERIFY(recentLargeFile.setFileTime(recentTime, QFileDevice::FileModificationTime)); + QVERIFY(recentSmallFile.setFileTime(recentTime, QFileDevice::FileModificationTime)); + QVERIFY(oldLargeFile.setFileTime(oldTime, QFileDevice::FileModificationTime)); + recentLargeFile.close(); + recentSmallFile.close(); + oldLargeFile.close(); + + SearchOptions options = createRealtimeOptions(rootDir); + SizeRangeFilter sizeFilter; + sizeFilter.setMin(1024); + options.setSizeRangeFilter(sizeFilter); + + TimeRangeFilter timeFilter; + 
timeFilter.setTimeField(TimeField::ModifyTime) + .setRange(QDateTime::fromSecsSinceEpoch(1711500000), + QDateTime::fromSecsSinceEpoch(1712500000)); + options.setTimeRangeFilter(timeFilter); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("recent")); + QVERIFY(expected.hasValue()); + QCOMPARE(resultPaths(expected), QStringList { recentLarge }); +} + +void tst_FileNameSearchEngine::realtime_detailedResults_populateAttributes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + const QString archivePath = rootDir + "/archive.zip"; + QVERIFY(createFileWithSize(archivePath, 2048)); + + SearchOptions options = createRealtimeOptions(rootDir); + options.setDetailedResultsEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("archive")); + QVERIFY(expected.hasValue()); + QCOMPARE(expected.value().size(), 1); + + SearchResult result = expected.value().first(); + FileNameResultAPI api(result); + QCOMPARE(result.path(), archivePath); + QCOMPARE(api.filename(), QString("archive.zip")); + QCOMPARE(api.fileExtension(), QString("zip")); + QCOMPARE(api.fileType(), QString("zip")); + QCOMPARE(api.fileSizeBytes(), qint64(2048)); + QCOMPARE(api.isDirectory(), false); + QCOMPARE(api.isHidden(), false); + QVERIFY(api.modifyTimestamp() > 0); +} + +void tst_FileNameSearchEngine::realtime_pinyinOption_doesNotProducePinyinMatches() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/项目计划.docx", 32)); + + SearchOptions options = 
createRealtimeOptions(rootDir); + FileNameOptionsAPI api(options); + api.setPinyinEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("xiangmujihua")); + QVERIFY(expected.hasValue()); + QCOMPARE(expected.value().size(), 0); +} + +QObject *create_tst_FileNameSearchEngine() +{ + return new tst_FileNameSearchEngine(); +} + +#include "tst_filename_search_engine.moc" From 5b9b3792ac5b616e62adf3afc9611ef2b81d9d92 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Tue, 26 May 2026 11:03:33 +0800 Subject: [PATCH 35/36] docs: update license files and cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Updated .reuse/dep5 with new copyright entries for: - cpp-stub (MIT) - ELFIO (MIT) - semantic rules (GPL-3.0-or-later) 2. Removed LGPL-3.0-or-later.txt which is no longer needed 3. Added MIT.txt license file for new MIT-licensed components 4. Removed unused .gitkeep from tools directory These changes reflect updated copyright and license information for third-party components used in the project, and cleanup of unused files. Influence: 1. Verify build system continues to work with new licensing information 2. Check that all new license files are properly referenced 3. Confirm removed files were actually obsolete 4. Verify project documentation references correct licenses docs: 更新许可证文件并清理 1. 更新.reuse/dep5,新增以下版权条目: - cpp-stub (MIT) - ELFIO (MIT) - 语义规则 (GPL-3.0-or-later) 2. 删除不再需要的LGPL-3.0-or-later.txt 3. 新增MIT.txt许可证文件用于新增的MIT授权组件 4. 删除tools目录中未使用的.gitkeep文件 这些变更反映了项目中使用的第三方组件的版权和许可证信息更新,以及对未使用 文件的清理。 Influence: 1. 验证构建系统能继续工作包括新的授权信息 2. 检查所有新许可证文件是否正确引用 3. 确认被删除的文件确实不再需要 4. 
验证项目文档引用了正确的许可证 --- .gitignore | 1 + .reuse/dep5 | 15 + 3rdparty/testutils/stub-ext/stub-shadow.cpp | 2 +- 3rdparty/testutils/stub-ext/stub-shadow.h | 2 +- 3rdparty/testutils/stub-ext/stubext.h | 2 +- LICENSES/LGPL-3.0-or-later.txt | 304 -------------------- LICENSES/MIT.txt | 9 + tools/.gitkeep | 0 8 files changed, 28 insertions(+), 307 deletions(-) delete mode 100644 LICENSES/LGPL-3.0-or-later.txt create mode 100644 LICENSES/MIT.txt delete mode 100644 tools/.gitkeep diff --git a/.gitignore b/.gitignore index 810d5e0c..fec08036 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,5 @@ AGENTS.md .trellis .claude .agents +.codex diff --git a/.reuse/dep5 b/.reuse/dep5 index 82dcd88e..d0e93053 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -43,3 +43,18 @@ Files: src/dfm-burn/3rdparty/udfclient/* Copyright: Reinoud Zandijk License: ClArtistic +# cpp-stub (MIT) +Files: 3rdparty/testutils/cpp-stub/stub.h 3rdparty/testutils/cpp-stub/addr_any.h 3rdparty/testutils/cpp-stub/addr_pri.h +Copyright: jobczz +License: MIT + +# ELFIO (MIT) +Files: 3rdparty/testutils/cpp-stub/elfio.hpp +Copyright: Sergei Tikhomirov +License: MIT + +# semantic rules +Files: src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/*.json +Copyright: 2026 UnionTech Software Technology Co., Ltd. +License: GPL-3.0-or-later + diff --git a/3rdparty/testutils/stub-ext/stub-shadow.cpp b/3rdparty/testutils/stub-ext/stub-shadow.cpp index 05d02092..bfac8c78 100644 --- a/3rdparty/testutils/stub-ext/stub-shadow.cpp +++ b/3rdparty/testutils/stub-ext/stub-shadow.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2021 - 2023 UnionTech Software Technology Co., Ltd. +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
// // SPDX-License-Identifier: MIT diff --git a/3rdparty/testutils/stub-ext/stub-shadow.h b/3rdparty/testutils/stub-ext/stub-shadow.h index 9199a12a..e137f35a 100644 --- a/3rdparty/testutils/stub-ext/stub-shadow.h +++ b/3rdparty/testutils/stub-ext/stub-shadow.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2021 - 2023 UnionTech Software Technology Co., Ltd. +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. // // SPDX-License-Identifier: MIT diff --git a/3rdparty/testutils/stub-ext/stubext.h b/3rdparty/testutils/stub-ext/stubext.h index a31d815d..33961a7c 100644 --- a/3rdparty/testutils/stub-ext/stubext.h +++ b/3rdparty/testutils/stub-ext/stubext.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2021 - 2023 UnionTech Software Technology Co., Ltd. +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. // // SPDX-License-Identifier: MIT diff --git a/LICENSES/LGPL-3.0-or-later.txt b/LICENSES/LGPL-3.0-or-later.txt deleted file mode 100644 index 513d1c01..00000000 --- a/LICENSES/LGPL-3.0-or-later.txt +++ /dev/null @@ -1,304 +0,0 @@ -GNU LESSER GENERAL PUBLIC LICENSE -Version 3, 29 June 2007 - -Copyright (C) 2007 Free Software Foundation, Inc. - -Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. - -This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. - -0. Additional Definitions. - -As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. - -"The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. - -An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. 
Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. - -A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". - -The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. - -The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. - -1. Exception to Section 3 of the GNU GPL. -You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. - -2. Conveying Modified Versions. -If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: - - a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. - -3. Object Code Incorporating Material from Library Header Files. -The object code form of an Application may incorporate material from a header file that is part of the Library. 
You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license document. - -4. Combined Works. -You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: - - a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license document. - - c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the Library. 
A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. - - e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) - -5. Combined Libraries. -You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. - -6. Revised Versions of the GNU Lesser General Public License. -The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. 
If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. - -If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. - -GNU GENERAL PUBLIC LICENSE -Version 3, 29 June 2007 - -Copyright © 2007 Free Software Foundation, Inc. - -Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. - -Preamble - -The GNU General Public License is a free, copyleft license for software and other kinds of works. - -The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. - -When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. - -To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. - -For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. - -Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. - -For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. - -Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. - -Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. - -The precise terms and conditions for copying, distribution and modification follow. - -TERMS AND CONDITIONS - -0. Definitions. - -“This License” refers to version 3 of the GNU General Public License. - -“Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. - -“The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. - -To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. - -A “covered work” means either the unmodified Program or a work based on the Program. - -To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. - -To “convey” a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. - -An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. - -1. Source Code. -The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. - -A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. - -The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. - -The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. - -The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. - -The Corresponding Source for a work in source code form is that same work. - -2. Basic Permissions. -All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. - -You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
- -Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. - -3. Protecting Users' Legal Rights From Anti-Circumvention Law. -No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. - -When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. - -4. Conveying Verbatim Copies. -You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. - -You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. - -5. Conveying Modified Source Versions. -You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
- - b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. - - c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. - -A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. - -6. Conveying Non-Source Forms. -You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: - - a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. - - d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
- -A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. - -A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. - -“Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. - -If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). - -The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. - -Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. - -7. Additional Terms. -“Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. - -When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
- -Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or authors of the material; or - - e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. - -All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
- -If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. - -Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. - -8. Termination. -You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). - -However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. - -Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. - -Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. - -9. Acceptance Not Required for Having Copies. -You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. - -10. Automatic Licensing of Downstream Recipients. -Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. - -An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. - -You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. - -11. Patents. -A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's “contributor version”. - -A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. - -Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. - -In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. - -If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
“Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. - -If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. - -A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. - -Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. - -12. No Surrender of Others' Freedom. 
-If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. - -13. Use with the GNU Affero General Public License. -Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. - -14. Revised Versions of this License. -The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
- -If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. - -Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. - -15. Disclaimer of Warranty. -THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - -16. Limitation of Liability. -IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -17. Interpretation of Sections 15 and 16. 
-If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. - -END OF TERMS AND CONDITIONS - -How to Apply These Terms to Your New Programs - -If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. - -To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - -If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
- -The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an “about box”. - -You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . - -The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . diff --git a/LICENSES/MIT.txt b/LICENSES/MIT.txt new file mode 100644 index 00000000..2071b23b --- /dev/null +++ b/LICENSES/MIT.txt @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/tools/.gitkeep b/tools/.gitkeep deleted file mode 100644 index e69de29b..00000000 From fad185858da8a7f8437a606d648b30bf13037927 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Thu, 28 May 2026 14:03:34 +0800 Subject: [PATCH 36/36] chore: bump version to 1.3.57 1.3.57 Log: --- debian/changelog | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/debian/changelog b/debian/changelog index 2712ec58..64e0be13 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,45 @@ +util-dfm (1.3.57) unstable; urgency=medium + + * fix: change search worker connection to direct + * feat: add filename search to content and OCR search + * feat: add natural language semantic search + * feat: add relative time support for Chinese search + * feat: implement file size range filtering + * feat: add file size constraint support in semantic search + * feat: add action-based time field search support + * fix: automatically handle hidden path search conditions + * feat: add location-based search support for Chinese NLP + * feat: add semantic query detection and multi-path search support + * feat: add file size range filter to search strategies + * fix: unify dfm-search library and path names + * feat: add file metadata attributes to search results + * fix: improve Chinese NLP search functionality + * feat: add semantic search with detailed results + * feat: enhance semantic search with explicit directories + * feat: add max results limit for semantic search + * test: add search target control tests + * feat: add chinese NLP parsing for relative time and size constraints + * feat: add NGram analyzer and tokenizer for Lucene++ + * fix: improve content search engine validation and 
analyzer + * refactor: optimize search filtering and query building + * feat: add on-demand content highlight retrieval + * refactor: improve NGramTokenizer and search factory + * refactor: improve OCR text search validation and analyzer selection + * perf: optimize search performance with field selector + * refactor: disable unit tests in release builds + * feat: optimize ngram search query building + * refactor: remove NGram analyzer and tokenizer components + * fix: adjust N-gram token position calculation + * feat: enhance ContentRetriever with content fetching capabilities + * test: add test utility libraries for content search + * perf: optimize OCR text search document loading + * perf: replace chinese analyzer with ngram search + * test: add filename search engine test cases + * docs: update license files and cleanup + * refactor(mount): use memfd instead of pipe for password transfer + + -- Zhang Sheng Thu, 28 May 2026 14:03:12 +0800 + util-dfm (1.3.56) unstable; urgency=medium * perf: cache resolved indexed directories to avoid repeated resolution