diff --git a/.gitignore b/.gitignore index 810d5e0c..fec08036 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,5 @@ AGENTS.md .trellis .claude .agents +.codex diff --git a/.reuse/dep5 b/.reuse/dep5 index ff15d1f2..d0e93053 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -43,8 +43,18 @@ Files: src/dfm-burn/3rdparty/udfclient/* Copyright: Reinoud Zandijk License: ClArtistic -# fulltext -Files: src/dfm-search/3rdparty/fulltext/* -Copyright: 2009-2014 Alan Wright -License: LGPL-3.0-or-later +# cpp-stub (MIT) +Files: 3rdparty/testutils/cpp-stub/stub.h 3rdparty/testutils/cpp-stub/addr_any.h 3rdparty/testutils/cpp-stub/addr_pri.h +Copyright: jobczz +License: MIT + +# ELFIO (MIT) +Files: 3rdparty/testutils/cpp-stub/elfio.hpp +Copyright: Sergei Tikhomirov +License: MIT + +# semantic rules +Files: src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/*.json +Copyright: 2026 UnionTech Software Technology Co., Ltd. +License: GPL-3.0-or-later diff --git a/tools/.gitkeep b/3rdparty/.gitkeep similarity index 100% rename from tools/.gitkeep rename to 3rdparty/.gitkeep diff --git a/3rdparty/testutils/cpp-stub/addr_any.h b/3rdparty/testutils/cpp-stub/addr_any.h new file mode 100644 index 00000000..a153f348 --- /dev/null +++ b/3rdparty/testutils/cpp-stub/addr_any.h @@ -0,0 +1,280 @@ +#ifndef __ADDR_ANY_H__ +#define __ADDR_ANY_H__ + + +//linux +#include +#include +//c +#include +#include +#include + +//c++ +#include +#include +//project +#include "elfio.hpp" + + + +class AddrAny +{ +public: + AddrAny() + { + m_init = get_exe_pathname(m_fullname); + m_baseaddr = 0; + } + AddrAny(std::string libname) + { + m_init = get_lib_pathname_and_baseaddr(libname, m_fullname, m_baseaddr); + } + + int get_local_func_addr_symtab(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_SYMTAB, STB_LOCAL, func_name_regex_str, result); + } + int get_global_func_addr_symtab(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_SYMTAB, STB_GLOBAL, 
func_name_regex_str, result); + } + int get_weak_func_addr_symtab(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_SYMTAB, STB_WEAK, func_name_regex_str, result); + } + + int get_global_func_addr_dynsym( std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_DYNSYM, STB_GLOBAL, func_name_regex_str, result); + } + int get_weak_func_addr_dynsym(std::string func_name_regex_str, std::map& result) + { + return get_func_addr(SHT_DYNSYM, STB_WEAK, func_name_regex_str, result); + } + +private: + bool demangle(std::string& s, std::string& name) { + int status; + char* pname = abi::__cxa_demangle(s.c_str(), 0, 0, &status); + if (status != 0) + { + switch(status) + { + case -1: name = "memory allocation error"; break; + case -2: name = "invalid name given"; break; + case -3: name = "internal error: __cxa_demangle: invalid argument"; break; + default: name = "unknown error occured"; break; + } + return false; + } + name = pname; + free(pname); + return true; + } + bool get_exe_pathname( std::string& name) + { + char line[512]; + FILE *fp; + uintptr_t base_addr; + char perm[5]; + unsigned long offset; + int pathname_pos; + char *pathname; + size_t pathname_len; + int match = 0; + + if(NULL == (fp = fopen("/proc/self/maps", "r"))) + { + return false; + } + + while(fgets(line, sizeof(line), fp)) + { + if(sscanf(line, "%" PRIxPTR "-%*lx %4s %lx %*x:%*x %*d%n", &base_addr, perm, &offset, &pathname_pos) != 3) continue; + + if(0 != offset) continue; + + //get pathname + while(isspace(line[pathname_pos]) && pathname_pos < (int)(sizeof(line) - 1)) + pathname_pos += 1; + if(pathname_pos >= (int)(sizeof(line) - 1)) continue; + pathname = line + pathname_pos; + pathname_len = strlen(pathname); + if(0 == pathname_len) continue; + if(pathname[pathname_len - 1] == '\n') + { + pathname[pathname_len - 1] = '\0'; + pathname_len -= 1; + } + if(0 == pathname_len) continue; + if('[' == pathname[0]) continue; + + name = pathname; + match = 
1; + break; + + } + fclose(fp); + + if(0 == match) + { + return false; + } + else + { + return true; + } + + } + + bool get_lib_pathname_and_baseaddr(std::string pathname_regex_str, std::string& name, unsigned long& addr) + { + char line[512]; + FILE *fp; + uintptr_t base_addr; + char perm[5]; + unsigned long offset; + int pathname_pos; + char *pathname; + size_t pathname_len; + int match; + regex_t pathname_regex; + + regcomp(&pathname_regex, pathname_regex_str.c_str(), 0); + + if(NULL == (fp = fopen("/proc/self/maps", "r"))) + { + return false; + } + + while(fgets(line, sizeof(line), fp)) + { + if(sscanf(line, "%" PRIxPTR "-%*lx %4s %lx %*x:%*x %*d%n", &base_addr, perm, &offset, &pathname_pos) != 3) continue; + + //check permission + if(perm[0] != 'r') continue; + if(perm[3] != 'p') continue; //do not touch the shared memory + + //check offset + // + //We are trying to find ELF header in memory. + //It can only be found at the beginning of a mapped memory regions + //whose offset is 0. + if(0 != offset) continue; + + //get pathname + while(isspace(line[pathname_pos]) && pathname_pos < (int)(sizeof(line) - 1)) + pathname_pos += 1; + if(pathname_pos >= (int)(sizeof(line) - 1)) continue; + pathname = line + pathname_pos; + pathname_len = strlen(pathname); + if(0 == pathname_len) continue; + if(pathname[pathname_len - 1] == '\n') + { + pathname[pathname_len - 1] = '\0'; + pathname_len -= 1; + } + if(0 == pathname_len) continue; + if('[' == pathname[0]) continue; + + //check pathname + //if we need to hook this elf? 
+ match = 0; + if(0 == regexec(&pathname_regex, pathname, 0, NULL, 0)) + { + match = 1; + name = pathname; + addr = (unsigned long)base_addr; + break; + } + if(0 == match) continue; + + } + fclose(fp); + if(0 == match) + { + return false; + } + else + { + return true; + } + + } + + int get_func_addr(unsigned int ttype, unsigned int stype, std::string& func_name_regex_str, std::map& result) + { + // Create an elfio reader + ELFIO::elfio reader; + int count = 0; + regex_t pathname_regex; + + if(!m_init) + { + return -1; + } + + regcomp(&pathname_regex, func_name_regex_str.c_str(), 0); + // Load ELF data + if(!reader.load(m_fullname.c_str())) + { + return -1; + } + + ELFIO::Elf_Half sec_num = reader.sections.size(); + for(int i = 0; i < sec_num; ++i) + { + ELFIO::section* psec = reader.sections[i]; + // Check section type + if(psec->get_type() == ttype) + { + const ELFIO::symbol_section_accessor symbols( reader, psec ); + for ( unsigned int j = 0; j < symbols.get_symbols_num(); ++j ) + { + std::string name; + std::string name_mangle; + ELFIO::Elf64_Addr value; + ELFIO::Elf_Xword size; + unsigned char bind; + unsigned char type; + ELFIO::Elf_Half section_index; + unsigned char other; + + // Read symbol properties + symbols.get_symbol( j, name, value, size, bind, type, section_index, other ); + if(type == STT_FUNC && bind == stype) + { + bool ret = demangle(name,name_mangle); + if(ret == true) + { + if (0 == regexec(&pathname_regex, name_mangle.c_str(), 0, NULL, 0)) + { + result.insert ( std::pair(name_mangle,(void*)(value + m_baseaddr))); + count++; + } + } + else + { + if (0 == regexec(&pathname_regex, name.c_str(), 0, NULL, 0)) + { + result.insert ( std::pair(name,(void*)(value + m_baseaddr))); + count++; + } + } + } + } + break; + } + } + + return count; + } +private: + bool m_init; + std::string m_name; + std::string m_fullname; + unsigned long m_baseaddr; + +}; +#endif diff --git a/3rdparty/testutils/cpp-stub/addr_pri.h b/3rdparty/testutils/cpp-stub/addr_pri.h new 
file mode 100644 index 00000000..9174bb0c --- /dev/null +++ b/3rdparty/testutils/cpp-stub/addr_pri.h @@ -0,0 +1,177 @@ +#ifndef __ADDR_PRI_H__ +#define __ADDR_PRI_H__ + + +#include +#include + + + +//base on C++11 + +/********************************************************** + access private function +**********************************************************/ + + +namespace std { + template + using enable_if_t = typename enable_if::type; + template + using remove_reference_t = typename remove_reference::type; +} // std + +// Unnamed namespace is used to avoid duplicate symbols if the macros are used +namespace { + namespace private_access_detail { + + // @tparam TagType, used to declare different "get" funciton overloads for + // different members/statics + template + struct private_access { + // Normal lookup cannot find in-class defined (inline) friend functions. + friend PtrType get(TagType) { return PtrValue; } + }; + + } // namespace private_access_detail +} // namespace + +// Used macro naming conventions: +// The "namespace" of this macro library is PRIVATE_ACCESS, i.e. all +// macro here has this prefix. +// All implementation macro, which are not meant to be used directly have the +// PRIVATE_ACCESS_DETAIL prefix. +// Some macros have the ABCD_IMPL form, which means they contain the +// implementation details for the specific ABCD macro. + +#define PRIVATE_ACCESS_DETAIL_CONCATENATE_IMPL(x, y) x##y +#define PRIVATE_ACCESS_DETAIL_CONCATENATE(x, y) \ + PRIVATE_ACCESS_DETAIL_CONCATENATE_IMPL(x, y) + +// @param PtrTypeKind E.g if we have "class A", then it can be "A::*" in case of +// members, or it can be "*" in case of statics. 
+#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, \ + PtrTypeKind) \ + namespace { \ + namespace private_access_detail { \ + /* Tag type, used to declare different get funcitons for different \ + * members \ + */ \ + struct Tag {}; \ + /* Explicit instantiation */ \ + template struct private_access; \ + /* We can build the PtrType only with two aliases */ \ + /* E.g. using PtrType = int(int) *; would be illformed */ \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(Alias_, Tag) = Type; \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(PtrType_, Tag) = \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(Alias_, Tag) PtrTypeKind; \ + /* Declare the friend function, now it is visible in namespace scope. \ + * Note, \ + * we could declare it inside the Tag type too, in that case ADL would \ + * find \ + * the declaration. By choosing to declare it here, the Tag type remains \ + * a \ + * simple tag type, it has no other responsibilities. */ \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(PtrType_, Tag) get(Tag); \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FIELD(Tag, Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, Class::*) \ + namespace { \ + namespace access_private_field { \ + Type &Class##Name(Class &&t) { return t.*get(private_access_detail::Tag{}); } \ + Type &Class##Name(Class &t) { return t.*get(private_access_detail::Tag{}); } \ + /* The following usings are here to avoid duplicate const qualifier \ + * warnings \ + */ \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(X, Tag) = Type; \ + using PRIVATE_ACCESS_DETAIL_CONCATENATE(Y, Tag) = \ + const PRIVATE_ACCESS_DETAIL_CONCATENATE(X, Tag); \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(Y, Tag) & Class##Name(const Class &t) {\ + return t.*get(private_access_detail::Tag{}); \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FUN(Tag, Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, Class::*) \ + namespace { \ + namespace call_private_fun { \ + 
/* We do perfect forwarding, but we want to restrict the overload set \ + * only for objects which have the type Class. */ \ + template , \ + Class>::value> * = nullptr, \ + typename... Args> \ + auto Class##Name(Obj &&o, Args &&... args) -> decltype( \ + (std::forward(o).* \ + get(private_access_detail::Tag{}))(std::forward(args)...)) { \ + return (std::forward(o).*get(private_access_detail::Tag{}))( \ + std::forward(args)...); \ + } \ + } \ + namespace get_private_fun { \ + auto Class##Name() -> decltype( \ + get(private_access_detail::Tag{})) { \ + return (get(private_access_detail::Tag{})); \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FIELD(Tag, Class, Type, \ + Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, *) \ + namespace { \ + namespace access_private_static_field { \ + namespace Class { \ + Type &Class##Name() { return *get(private_access_detail::Tag{}); } \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FUN(Tag, Class, Type, \ + Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE(Tag, Class, Type, Name, *) \ + namespace { \ + namespace call_private_static_fun { \ + namespace Class { \ + template \ + auto Class##Name(Args &&... 
args) -> decltype( \ + get(private_access_detail::Tag{})(std::forward(args)...)) { \ + return get(private_access_detail::Tag{})( \ + std::forward(args)...); \ + } \ + } \ + } \ + namespace get_private_static_fun { \ + namespace Class { \ + auto Class##Name() -> decltype(get(private_access_detail::Tag{})) { \ + return get(private_access_detail::Tag{}); \ + } \ + } \ + } \ + } + +#define PRIVATE_ACCESS_DETAIL_UNIQUE_TAG \ + PRIVATE_ACCESS_DETAIL_CONCATENATE(PrivateAccessTag, __COUNTER__) + +#define ACCESS_PRIVATE_FIELD(Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FIELD(PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, \ + Class, Type, Name) + +#define ACCESS_PRIVATE_FUN(Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_FUN(PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, \ + Class, Type, Name) + +#define ACCESS_PRIVATE_STATIC_FIELD(Class, Type, Name) \ + Type Class::Name; \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FIELD( \ + PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, Class, Type, Name) + +#define ACCESS_PRIVATE_STATIC_FUN(Class, Type, Name) \ + PRIVATE_ACCESS_DETAIL_ACCESS_PRIVATE_STATIC_FUN( \ + PRIVATE_ACCESS_DETAIL_UNIQUE_TAG, Class, Type, Name) + +#endif diff --git a/3rdparty/testutils/cpp-stub/elfio.hpp b/3rdparty/testutils/cpp-stub/elfio.hpp new file mode 100644 index 00000000..dd5c9aec --- /dev/null +++ b/3rdparty/testutils/cpp-stub/elfio.hpp @@ -0,0 +1,4888 @@ + +/*** Start of inlined file: elfio_dump.hpp ***/ +#ifndef ELFIO_DUMP_HPP +#define ELFIO_DUMP_HPP + +#include +#include +#include +#include +#include + +/*** Start of inlined file: elfio.hpp ***/ +#ifndef ELFIO_HPP +#define ELFIO_HPP + +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4996 ) +#pragma warning( disable : 4355 ) +#pragma warning( disable : 4244 ) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +/*** Start of inlined file: elf_types.hpp ***/ +#ifndef ELFTYPES_H +#define ELFTYPES_H + +#ifndef ELFIO_NO_OWN_TYPES +#if !defined( 
ELFIO_NO_CSTDINT ) && !defined( ELFIO_NO_INTTYPES ) +#include +#else +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +#ifdef _MSC_VER +typedef unsigned __int32 uint32_t; +typedef signed __int32 int32_t; +typedef unsigned __int64 uint64_t; +typedef signed __int64 int64_t; +#else +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +#endif // _MSC_VER +#endif // ELFIO_NO_CSTDINT +#endif // ELFIO_NO_OWN_TYPES + +namespace ELFIO { + +// Attention! Platform depended definitions. +typedef uint16_t Elf_Half; +typedef uint32_t Elf_Word; +typedef int32_t Elf_Sword; +typedef uint64_t Elf_Xword; +typedef int64_t Elf_Sxword; + +typedef uint32_t Elf32_Addr; +typedef uint32_t Elf32_Off; +typedef uint64_t Elf64_Addr; +typedef uint64_t Elf64_Off; + +#define Elf32_Half Elf_Half +#define Elf64_Half Elf_Half +#define Elf32_Word Elf_Word +#define Elf64_Word Elf_Word +#define Elf32_Sword Elf_Sword +#define Elf64_Sword Elf_Sword + +/////////////////////// +// ELF Header Constants + +// File type +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOOS 0xFE00 +#define ET_HIOS 0xFEFF +#define ET_LOPROC 0xFF00 +#define ET_HIPROC 0xFFFF + +#define EM_NONE 0 // No machine +#define EM_M32 1 // AT&T WE 32100 +#define EM_SPARC 2 // SUN SPARC +#define EM_386 3 // Intel 80386 +#define EM_68K 4 // Motorola m68k family +#define EM_88K 5 // Motorola m88k family +#define EM_486 6 // Intel 80486// Reserved for future use +#define EM_860 7 // Intel 80860 +#define EM_MIPS 8 // MIPS R3000 (officially, big-endian only) +#define EM_S370 9 // IBM System/370 +#define EM_MIPS_RS3_LE \ + 10 // MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated +#define EM_res011 11 // Reserved +#define EM_res012 12 // Reserved +#define EM_res013 13 // Reserved +#define EM_res014 14 // Reserved +#define EM_PARISC 15 
// HPPA +#define EM_res016 16 // Reserved +#define EM_VPP550 17 // Fujitsu VPP500 +#define EM_SPARC32PLUS 18 // Sun's "v8plus" +#define EM_960 19 // Intel 80960 +#define EM_PPC 20 // PowerPC +#define EM_PPC64 21 // 64-bit PowerPC +#define EM_S390 22 // IBM S/390 +#define EM_SPU 23 // Sony/Toshiba/IBM SPU +#define EM_res024 24 // Reserved +#define EM_res025 25 // Reserved +#define EM_res026 26 // Reserved +#define EM_res027 27 // Reserved +#define EM_res028 28 // Reserved +#define EM_res029 29 // Reserved +#define EM_res030 30 // Reserved +#define EM_res031 31 // Reserved +#define EM_res032 32 // Reserved +#define EM_res033 33 // Reserved +#define EM_res034 34 // Reserved +#define EM_res035 35 // Reserved +#define EM_V800 36 // NEC V800 series +#define EM_FR20 37 // Fujitsu FR20 +#define EM_RH32 38 // TRW RH32 +#define EM_MCORE 39 // Motorola M*Core // May also be taken by Fujitsu MMA +#define EM_RCE 39 // Old name for MCore +#define EM_ARM 40 // ARM +#define EM_OLD_ALPHA 41 // Digital Alpha +#define EM_SH 42 // Renesas (formerly Hitachi) / SuperH SH +#define EM_SPARCV9 43 // SPARC v9 64-bit +#define EM_TRICORE 44 // Siemens Tricore embedded processor +#define EM_ARC 45 // ARC Cores +#define EM_H8_300 46 // Renesas (formerly Hitachi) H8/300 +#define EM_H8_300H 47 // Renesas (formerly Hitachi) H8/300H +#define EM_H8S 48 // Renesas (formerly Hitachi) H8S +#define EM_H8_500 49 // Renesas (formerly Hitachi) H8/500 +#define EM_IA_64 50 // Intel IA-64 Processor +#define EM_MIPS_X 51 // Stanford MIPS-X +#define EM_COLDFIRE 52 // Motorola Coldfire +#define EM_68HC12 53 // Motorola M68HC12 +#define EM_MMA 54 // Fujitsu Multimedia Accelerator +#define EM_PCP 55 // Siemens PCP +#define EM_NCPU 56 // Sony nCPU embedded RISC processor +#define EM_NDR1 57 // Denso NDR1 microprocesspr +#define EM_STARCORE 58 // Motorola Star*Core processor +#define EM_ME16 59 // Toyota ME16 processor +#define EM_ST100 60 // STMicroelectronics ST100 processor +#define EM_TINYJ 61 // Advanced Logic 
Corp. TinyJ embedded processor +#define EM_X86_64 62 // Advanced Micro Devices X86-64 processor +#define EM_PDSP 63 // Sony DSP Processor +#define EM_PDP10 64 // Digital Equipment Corp. PDP-10 +#define EM_PDP11 65 // Digital Equipment Corp. PDP-11 +#define EM_FX66 66 // Siemens FX66 microcontroller +#define EM_ST9PLUS 67 // STMicroelectronics ST9+ 8/16 bit microcontroller +#define EM_ST7 68 // STMicroelectronics ST7 8-bit microcontroller +#define EM_68HC16 69 // Motorola MC68HC16 Microcontroller +#define EM_68HC11 70 // Motorola MC68HC11 Microcontroller +#define EM_68HC08 71 // Motorola MC68HC08 Microcontroller +#define EM_68HC05 72 // Motorola MC68HC05 Microcontroller +#define EM_SVX 73 // Silicon Graphics SVx +#define EM_ST19 74 // STMicroelectronics ST19 8-bit cpu +#define EM_VAX 75 // Digital VAX +#define EM_CRIS 76 // Axis Communications 32-bit embedded processor +#define EM_JAVELIN 77 // Infineon Technologies 32-bit embedded cpu +#define EM_FIREPATH 78 // Element 14 64-bit DSP processor +#define EM_ZSP 79 // LSI Logic's 16-bit DSP processor +#define EM_MMIX 80 // Donald Knuth's educational 64-bit processor +#define EM_HUANY 81 // Harvard's machine-independent format +#define EM_PRISM 82 // SiTera Prism +#define EM_AVR 83 // Atmel AVR 8-bit microcontroller +#define EM_FR30 84 // Fujitsu FR30 +#define EM_D10V 85 // Mitsubishi D10V +#define EM_D30V 86 // Mitsubishi D30V +#define EM_V850 87 // NEC v850 +#define EM_M32R 88 // Renesas M32R (formerly Mitsubishi M32R) +#define EM_MN10300 89 // Matsushita MN10300 +#define EM_MN10200 90 // Matsushita MN10200 +#define EM_PJ 91 // picoJava +#define EM_OPENRISC 92 // OpenRISC 32-bit embedded processor +#define EM_ARC_A5 93 // ARC Cores Tangent-A5 +#define EM_XTENSA 94 // Tensilica Xtensa Architecture +#define EM_VIDEOCORE 95 // Alphamosaic VideoCore processor +#define EM_TMM_GPP 96 // Thompson Multimedia General Purpose Processor +#define EM_NS32K 97 // National Semiconductor 32000 series +#define EM_TPC 98 // Tenor 
Network TPC processor +#define EM_SNP1K 99 // Trebia SNP 1000 processor +#define EM_ST200 100 // STMicroelectronics ST200 microcontroller +#define EM_IP2K 101 // Ubicom IP2022 micro controller +#define EM_MAX 102 // MAX Processor +#define EM_CR 103 // National Semiconductor CompactRISC +#define EM_F2MC16 104 // Fujitsu F2MC16 +#define EM_MSP430 105 // TI msp430 micro controller +#define EM_BLACKFIN 106 // ADI Blackfin +#define EM_SE_C33 107 // S1C33 Family of Seiko Epson processors +#define EM_SEP 108 // Sharp embedded microprocessor +#define EM_ARCA 109 // Arca RISC Microprocessor +#define EM_UNICORE \ + 110 // Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University +#define EM_EXCESS 111 // eXcess: 16/32/64-bit configurable embedded CPU +#define EM_DXP 112 // Icera Semiconductor Inc. Deep Execution Processor +#define EM_ALTERA_NIOS2 113 // Altera Nios II soft-core processor +#define EM_CRX 114 // National Semiconductor CRX +#define EM_XGATE 115 // Motorola XGATE embedded processor +#define EM_C166 116 // Infineon C16x/XC16x processor +#define EM_M16C 117 // Renesas M16C series microprocessors +#define EM_DSPIC30F \ + 118 // Microchip Technology dsPIC30F Digital Signal Controller +#define EM_CE 119 // Freescale Communication Engine RISC core +#define EM_M32C 120 // Renesas M32C series microprocessors +#define EM_res121 121 // Reserved +#define EM_res122 122 // Reserved +#define EM_res123 123 // Reserved +#define EM_res124 124 // Reserved +#define EM_res125 125 // Reserved +#define EM_res126 126 // Reserved +#define EM_res127 127 // Reserved +#define EM_res128 128 // Reserved +#define EM_res129 129 // Reserved +#define EM_res130 130 // Reserved +#define EM_TSK3000 131 // Altium TSK3000 core +#define EM_RS08 132 // Freescale RS08 embedded processor +#define EM_res133 133 // Reserved +#define EM_ECOG2 134 // Cyan Technology eCOG2 microprocessor +#define EM_SCORE 135 // Sunplus Score +#define EM_SCORE7 135 // Sunplus S+core7 RISC processor +#define 
EM_DSP24 136 // New Japan Radio (NJR) 24-bit DSP Processor +#define EM_VIDEOCORE3 137 // Broadcom VideoCore III processor +#define EM_LATTICEMICO32 138 // RISC processor for Lattice FPGA architecture +#define EM_SE_C17 139 // Seiko Epson C17 family +#define EM_TI_C6000 140 // Texas Instruments TMS320C6000 DSP family +#define EM_TI_C2000 141 // Texas Instruments TMS320C2000 DSP family +#define EM_TI_C5500 142 // Texas Instruments TMS320C55x DSP family +#define EM_res143 143 // Reserved +#define EM_res144 144 // Reserved +#define EM_res145 145 // Reserved +#define EM_res146 146 // Reserved +#define EM_res147 147 // Reserved +#define EM_res148 148 // Reserved +#define EM_res149 149 // Reserved +#define EM_res150 150 // Reserved +#define EM_res151 151 // Reserved +#define EM_res152 152 // Reserved +#define EM_res153 153 // Reserved +#define EM_res154 154 // Reserved +#define EM_res155 155 // Reserved +#define EM_res156 156 // Reserved +#define EM_res157 157 // Reserved +#define EM_res158 158 // Reserved +#define EM_res159 159 // Reserved +#define EM_MMDSP_PLUS 160 // STMicroelectronics 64bit VLIW Data Signal Processor +#define EM_CYPRESS_M8C 161 // Cypress M8C microprocessor +#define EM_R32C 162 // Renesas R32C series microprocessors +#define EM_TRIMEDIA 163 // NXP Semiconductors TriMedia architecture family +#define EM_QDSP6 164 // QUALCOMM DSP6 Processor +#define EM_8051 165 // Intel 8051 and variants +#define EM_STXP7X 166 // STMicroelectronics STxP7x family +#define EM_NDS32 \ + 167 // Andes Technology compact code size embedded RISC processor family +#define EM_ECOG1 168 // Cyan Technology eCOG1X family +#define EM_ECOG1X 168 // Cyan Technology eCOG1X family +#define EM_MAXQ30 169 // Dallas Semiconductor MAXQ30 Core Micro-controllers +#define EM_XIMO16 170 // New Japan Radio (NJR) 16-bit DSP Processor +#define EM_MANIK 171 // M2000 Reconfigurable RISC Microprocessor +#define EM_CRAYNV2 172 // Cray Inc. 
NV2 vector architecture +#define EM_RX 173 // Renesas RX family +#define EM_METAG 174 // Imagination Technologies META processor architecture +#define EM_MCST_ELBRUS 175 // MCST Elbrus general purpose hardware architecture +#define EM_ECOG16 176 // Cyan Technology eCOG16 family +#define EM_CR16 177 // National Semiconductor CompactRISC 16-bit processor +#define EM_ETPU 178 // Freescale Extended Time Processing Unit +#define EM_SLE9X 179 // Infineon Technologies SLE9X core +#define EM_L1OM 180 // Intel L1OM +#define EM_INTEL181 181 // Reserved by Intel +#define EM_INTEL182 182 // Reserved by Intel +#define EM_res183 183 // Reserved by ARM +#define EM_res184 184 // Reserved by ARM +#define EM_AVR32 185 // Atmel Corporation 32-bit microprocessor family +#define EM_STM8 186 // STMicroeletronics STM8 8-bit microcontroller +#define EM_TILE64 187 // Tilera TILE64 multicore architecture family +#define EM_TILEPRO 188 // Tilera TILEPro multicore architecture family +#define EM_MICROBLAZE 189 // Xilinx MicroBlaze 32-bit RISC soft processor core +#define EM_CUDA 190 // NVIDIA CUDA architecture +#define EM_TILEGX 191 // Tilera TILE-Gx multicore architecture family +#define EM_CLOUDSHIELD 192 // CloudShield architecture family +#define EM_COREA_1ST 193 // KIPO-KAIST Core-A 1st generation processor family +#define EM_COREA_2ND 194 // KIPO-KAIST Core-A 2nd generation processor family +#define EM_ARC_COMPACT2 195 // Synopsys ARCompact V2 +#define EM_OPEN8 196 // Open8 8-bit RISC soft processor core +#define EM_RL78 197 // Renesas RL78 family +#define EM_VIDEOCORE5 198 // Broadcom VideoCore V processor +#define EM_78KOR 199 // Renesas 78KOR family +#define EM_56800EX 200 // Freescale 56800EX Digital Signal Controller (DSC) +#define EM_BA1 201 // Beyond BA1 CPU architecture +#define EM_BA2 202 // Beyond BA2 CPU architecture +#define EM_XCORE 203 // XMOS xCORE processor family +#define EM_MCHP_PIC 204 // Microchip 8-bit PIC(r) family +#define EM_INTEL205 205 // Reserved by Intel 
+#define EM_INTEL206 206 // Reserved by Intel +#define EM_INTEL207 207 // Reserved by Intel +#define EM_INTEL208 208 // Reserved by Intel +#define EM_INTEL209 209 // Reserved by Intel +#define EM_KM32 210 // KM211 KM32 32-bit processor +#define EM_KMX32 211 // KM211 KMX32 32-bit processor +#define EM_KMX16 212 // KM211 KMX16 16-bit processor +#define EM_KMX8 213 // KM211 KMX8 8-bit processor +#define EM_KVARC 214 // KM211 KVARC processor +#define EM_CDP 215 // Paneve CDP architecture family +#define EM_COGE 216 // Cognitive Smart Memory Processor +#define EM_COOL 217 // iCelero CoolEngine +#define EM_NORC 218 // Nanoradio Optimized RISC +#define EM_CSR_KALIMBA 219 // CSR Kalimba architecture family +#define EM_Z80 220 // Zilog Z80 +#define EM_VISIUM 221 // Controls and Data Services VISIUMcore processor +#define EM_FT32 222 // FTDI Chip FT32 high performance 32-bit RISC architecture +#define EM_MOXIE 223 // Moxie processor family +#define EM_AMDGPU 224 // AMD GPU architecture +#define EM_RISCV 243 // RISC-V +#define EM_LANAI 244 // Lanai processor +#define EM_CEVA 245 // CEVA Processor Architecture Family +#define EM_CEVA_X2 246 // CEVA X2 Processor Family +#define EM_BPF 247 // Linux BPF – in-kernel virtual machine +#define EM_GRAPHCORE_IPU 248 // Graphcore Intelligent Processing Unit +#define EM_IMG1 249 // Imagination Technologies +#define EM_NFP 250 // Netronome Flow Processor (P) +#define EM_CSKY 252 // C-SKY processor family + +// File version +#define EV_NONE 0 +#define EV_CURRENT 1 + +// Identification index +#define EI_MAG0 0 +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_ABIVERSION 8 +#define EI_PAD 9 +#define EI_NIDENT 16 + +// Magic number +#define ELFMAG0 0x7F +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' + +// File class +#define ELFCLASSNONE 0 +#define ELFCLASS32 1 +#define ELFCLASS64 2 + +// Encoding +#define ELFDATANONE 0 +#define 
ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +// OS extensions +#define ELFOSABI_NONE 0 // No extensions or unspecified +#define ELFOSABI_HPUX 1 // Hewlett-Packard HP-UX +#define ELFOSABI_NETBSD 2 // NetBSD +#define ELFOSABI_LINUX 3 // Linux +#define ELFOSABI_SOLARIS 6 // Sun Solaris +#define ELFOSABI_AIX 7 // AIX +#define ELFOSABI_IRIX 8 // IRIX +#define ELFOSABI_FREEBSD 9 // FreeBSD +#define ELFOSABI_TRU64 10 // Compaq TRU64 UNIX +#define ELFOSABI_MODESTO 11 // Novell Modesto +#define ELFOSABI_OPENBSD 12 // Open BSD +#define ELFOSABI_OPENVMS 13 // Open VMS +#define ELFOSABI_NSK 14 // Hewlett-Packard Non-Stop Kernel +#define ELFOSABI_AROS 15 // Amiga Research OS +#define ELFOSABI_FENIXOS 16 // The FenixOS highly scalable multi-core OS +// 64-255 Architecture-specific value range +#define ELFOSABI_AMDGPU_HSA \ + 64 // AMDGPU OS for HSA compatible compute // kernels. +#define ELFOSABI_AMDGPU_PAL \ + 65 // AMDGPU OS for AMD PAL compatible graphics // shaders and compute kernels. +#define ELFOSABI_AMDGPU_MESA3D \ + 66 // AMDGPU OS for Mesa3D compatible graphics // shaders and compute kernels. + +// AMDGPU specific e_flags +#define EF_AMDGPU_MACH 0x0ff // AMDGPU processor selection mask. +#define EF_AMDGPU_XNACK \ + 0x100 // Indicates if the XNACK target feature is // enabled for all code contained in the ELF. +// AMDGPU processors +#define EF_AMDGPU_MACH_NONE 0x000 // Unspecified processor. 
// AMDGPU machine codes (EF_AMDGPU_MACH_*).
// The R600 family occupies 0x001-0x010 with 0x011-0x01f reserved; the AMDGCN
// family occupies 0x020-0x02f. The *_FIRST / *_LAST macros alias the first and
// last named entry of each family so callers can range-check a value.
#define EF_AMDGPU_MACH_R600_R600 0x001
#define EF_AMDGPU_MACH_R600_R630 0x002
#define EF_AMDGPU_MACH_R600_RS880 0x003
#define EF_AMDGPU_MACH_R600_RV670 0x004
#define EF_AMDGPU_MACH_R600_RV710 0x005
#define EF_AMDGPU_MACH_R600_RV730 0x006
#define EF_AMDGPU_MACH_R600_RV770 0x007
#define EF_AMDGPU_MACH_R600_CEDAR 0x008
#define EF_AMDGPU_MACH_R600_CYPRESS 0x009
#define EF_AMDGPU_MACH_R600_JUNIPER 0x00a
#define EF_AMDGPU_MACH_R600_REDWOOD 0x00b
#define EF_AMDGPU_MACH_R600_SUMO 0x00c
#define EF_AMDGPU_MACH_R600_BARTS 0x00d
#define EF_AMDGPU_MACH_R600_CAICOS 0x00e
#define EF_AMDGPU_MACH_R600_CAYMAN 0x00f
#define EF_AMDGPU_MACH_R600_TURKS 0x010
#define EF_AMDGPU_MACH_R600_RESERVED_FIRST 0x011
#define EF_AMDGPU_MACH_R600_RESERVED_LAST 0x01f
#define EF_AMDGPU_MACH_R600_FIRST EF_AMDGPU_MACH_R600_R600
#define EF_AMDGPU_MACH_R600_LAST EF_AMDGPU_MACH_R600_TURKS
#define EF_AMDGPU_MACH_AMDGCN_GFX600 0x020
#define EF_AMDGPU_MACH_AMDGCN_GFX601 0x021
#define EF_AMDGPU_MACH_AMDGCN_GFX700 0x022
#define EF_AMDGPU_MACH_AMDGCN_GFX701 0x023
#define EF_AMDGPU_MACH_AMDGCN_GFX702 0x024
#define EF_AMDGPU_MACH_AMDGCN_GFX703 0x025
#define EF_AMDGPU_MACH_AMDGCN_GFX704 0x026
#define EF_AMDGPU_MACH_AMDGCN_GFX801 0x028
#define EF_AMDGPU_MACH_AMDGCN_GFX802 0x029
#define EF_AMDGPU_MACH_AMDGCN_GFX803 0x02a
#define EF_AMDGPU_MACH_AMDGCN_GFX810 0x02b
#define EF_AMDGPU_MACH_AMDGCN_GFX900 0x02c
#define EF_AMDGPU_MACH_AMDGCN_GFX902 0x02d
#define EF_AMDGPU_MACH_AMDGCN_GFX904 0x02e
#define EF_AMDGPU_MACH_AMDGCN_GFX906 0x02f
// RESERVED0 (0x027) is the hole between GFX704 (0x026) and GFX801 (0x028);
// RESERVED1 (0x030) is the first value after AMDGCN_LAST.
#define EF_AMDGPU_MACH_AMDGCN_RESERVED0 0x027
#define EF_AMDGPU_MACH_AMDGCN_RESERVED1 0x030
#define EF_AMDGPU_MACH_AMDGCN_FIRST EF_AMDGPU_MACH_AMDGCN_GFX600
#define EF_AMDGPU_MACH_AMDGCN_LAST EF_AMDGPU_MACH_AMDGCN_GFX906

/////////////////////
// Sections constants

// Section indexes
// Note: SHN_LORESERVE and SHN_LOPROC share 0xFF00, and SHN_XINDEX and
// SHN_HIRESERVE share 0xFFFF, so these cannot all be used as distinct
// `case` labels in one switch.
#define SHN_UNDEF 0
#define SHN_LORESERVE 0xFF00
#define SHN_LOPROC 0xFF00
#define SHN_HIPROC 0xFF1F
#define SHN_LOOS 0xFF20
#define SHN_HIOS 0xFF3F
#define SHN_ABS 0xFFF1
#define SHN_COMMON 0xFFF2
#define SHN_XINDEX 0xFFFF
#define SHN_HIRESERVE 0xFFFF

// Section types
// Values 12 and 13 are not assigned a name in this table.
#define SHT_NULL 0
#define SHT_PROGBITS 1
#define SHT_SYMTAB 2
#define SHT_STRTAB 3
#define SHT_RELA 4
#define SHT_HASH 5
#define SHT_DYNAMIC 6
#define SHT_NOTE 7
#define SHT_NOBITS 8
#define SHT_REL 9
#define SHT_SHLIB 10
#define SHT_DYNSYM 11
#define SHT_INIT_ARRAY 14
#define SHT_FINI_ARRAY 15
#define SHT_PREINIT_ARRAY 16
#define SHT_GROUP 17
#define SHT_SYMTAB_SHNDX 18
#define SHT_LOOS 0x60000000
#define SHT_HIOS 0x6fffffff
#define SHT_LOPROC 0x70000000
#define SHT_HIPROC 0x7FFFFFFF
#define SHT_LOUSER 0x80000000
#define SHT_HIUSER 0xFFFFFFFF

// Section attribute flags
// Single-bit values meant to be OR-ed together; the two *_MASK* entries are
// multi-bit masks rather than individual flags.
#define SHF_WRITE 0x1
#define SHF_ALLOC 0x2
#define SHF_EXECINSTR 0x4
#define SHF_MERGE 0x10
#define SHF_STRINGS 0x20
#define SHF_INFO_LINK 0x40
#define SHF_LINK_ORDER 0x80
#define SHF_OS_NONCONFORMING 0x100
#define SHF_GROUP 0x200
#define SHF_TLS 0x400
#define SHF_MASKOS 0x0ff00000
#define SHF_MASKPROC 0xF0000000

// Section group flags
#define GRP_COMDAT 0x1
#define GRP_MASKOS 0x0ff00000
#define GRP_MASKPROC 0xf0000000

// Symbol binding
// Note: STB_MULTIDEF and STB_LOPROC are both 13.
#define STB_LOCAL 0
#define STB_GLOBAL 1
#define STB_WEAK 2
#define STB_LOOS 10
#define STB_HIOS 12
#define STB_MULTIDEF 13
#define STB_LOPROC 13
#define STB_HIPROC 15

// Note types
#define NT_AMDGPU_METADATA 1
#define NT_AMD_AMDGPU_HSA_METADATA 10
#define NT_AMD_AMDGPU_ISA 11
#define NT_AMD_AMDGPU_PAL_METADATA 12

// Symbol types
// Note: STT_AMDGPU_HSA_KERNEL shares the value 10 with STT_LOOS.
#define STT_NOTYPE 0
#define STT_OBJECT 1
#define STT_FUNC 2
#define STT_SECTION 3
#define STT_FILE 4
#define STT_COMMON 5
#define STT_TLS 6
#define STT_LOOS 10
#define STT_AMDGPU_HSA_KERNEL 10
#define STT_HIOS 12
#define STT_LOPROC 13
#define STT_HIPROC 15

// Symbol visibility
#define STV_DEFAULT 0
#define STV_INTERNAL 1
#define STV_HIDDEN 2
#define STV_PROTECTED 3

// Undefined name
#define STN_UNDEF 0

// Relocation types
// Three families (R_386_*, R_X86_64_*, R_AMDGPU_*) are interleaved and ordered
// by numeric value, so the same number appears once per family. A relocation
// value is therefore only meaningful together with the machine it belongs to.
#define R_386_NONE 0
#define R_X86_64_NONE 0
#define R_AMDGPU_NONE 0
#define R_386_32 1
#define R_X86_64_64 1
#define R_AMDGPU_ABS32_LO 1
#define R_386_PC32 2
#define R_X86_64_PC32 2
#define R_AMDGPU_ABS32_HI 2
#define R_386_GOT32 3
#define R_X86_64_GOT32 3
#define R_AMDGPU_ABS64 3
#define R_386_PLT32 4
#define R_X86_64_PLT32 4
#define R_AMDGPU_REL32 4
#define R_386_COPY 5
#define R_X86_64_COPY 5
#define R_AMDGPU_REL64 5
#define R_386_GLOB_DAT 6
#define R_X86_64_GLOB_DAT 6
#define R_AMDGPU_ABS32 6
#define R_386_JMP_SLOT 7
#define R_X86_64_JUMP_SLOT 7
#define R_AMDGPU_GOTPCREL 7
#define R_386_RELATIVE 8
#define R_X86_64_RELATIVE 8
#define R_AMDGPU_GOTPCREL32_LO 8
#define R_386_GOTOFF 9
#define R_X86_64_GOTPCREL 9
#define R_AMDGPU_GOTPCREL32_HI 9
#define R_386_GOTPC 10
#define R_X86_64_32 10
#define R_AMDGPU_REL32_LO 10
#define R_386_32PLT 11
#define R_X86_64_32S 11
#define R_AMDGPU_REL32_HI 11
#define R_X86_64_16 12
#define R_X86_64_PC16 13
#define R_AMDGPU_RELATIVE64 13
#define R_386_TLS_TPOFF 14
#define R_X86_64_8 14
#define R_386_TLS_IE 15
#define R_X86_64_PC8 15
#define R_386_TLS_GOTIE 16
#define R_X86_64_DTPMOD64 16
#define R_386_TLS_LE 17
#define R_X86_64_DTPOFF64 17
#define R_386_TLS_GD 18
#define R_X86_64_TPOFF64 18
#define R_386_TLS_LDM 19
#define R_X86_64_TLSGD 19
#define R_386_16 20
#define R_X86_64_TLSLD 20
#define R_386_PC16 21
#define R_X86_64_DTPOFF32 21
#define R_386_8 22
#define R_X86_64_GOTTPOFF 22
#define R_386_PC8 23
#define R_X86_64_TPOFF32 23
#define R_386_TLS_GD_32 24
#define R_X86_64_PC64 24
#define R_386_TLS_GD_PUSH 25
#define R_X86_64_GOTOFF64 25
#define R_386_TLS_GD_CALL 26
#define R_X86_64_GOTPC32 26
#define R_386_TLS_GD_POP 27
#define R_X86_64_GOT64 27
#define R_386_TLS_LDM_32 28
#define R_X86_64_GOTPCREL64 28
#define R_386_TLS_LDM_PUSH 29
#define R_X86_64_GOTPC64 29
#define R_386_TLS_LDM_CALL 30
#define R_X86_64_GOTPLT64 30
#define R_386_TLS_LDM_POP 31
#define R_X86_64_PLTOFF64 31
#define R_386_TLS_LDO_32 32
#define R_386_TLS_IE_32 33
#define R_386_TLS_LE_32 34
#define R_X86_64_GOTPC32_TLSDESC 34
#define R_386_TLS_DTPMOD32 35
#define R_X86_64_TLSDESC_CALL 35
#define R_386_TLS_DTPOFF32 36
#define R_X86_64_TLSDESC 36
#define R_386_TLS_TPOFF32 37
#define R_X86_64_IRELATIVE 37
#define R_386_SIZE32 38
#define R_386_TLS_GOTDESC 39
#define R_386_TLS_DESC_CALL 40
#define R_386_TLS_DESC 41
#define R_386_IRELATIVE 42
#define R_386_GOT32X 43
#define R_X86_64_GNU_VTINHERIT 250
#define R_X86_64_GNU_VTENTRY 251

// Segment types
#define PT_NULL 0
#define PT_LOAD 1
#define PT_DYNAMIC 2
#define PT_INTERP 3
#define PT_NOTE 4
#define PT_SHLIB 5
#define PT_PHDR 6
#define PT_TLS 7
#define PT_LOOS 0x60000000
#define PT_HIOS 0x6fffffff
#define PT_LOPROC 0x70000000
#define PT_HIPROC 0x7FFFFFFF

// Segment flags
#define PF_X 1 // Execute
#define PF_W 2 // Write
#define PF_R 4 // Read
#define PF_MASKOS 0x0ff00000 // Unspecified
#define PF_MASKPROC 0xf0000000 // Unspecified

// Dynamic Array Tags
// Note: value 31 has no name here, and DT_ENCODING and DT_PREINIT_ARRAY are
// both 32.
#define DT_NULL 0
#define DT_NEEDED 1
#define DT_PLTRELSZ 2
#define DT_PLTGOT 3
#define DT_HASH 4
#define DT_STRTAB 5
#define DT_SYMTAB 6
#define DT_RELA 7
#define DT_RELASZ 8
#define DT_RELAENT 9
#define DT_STRSZ 10
#define DT_SYMENT 11
#define DT_INIT 12
#define DT_FINI 13
#define DT_SONAME 14
#define DT_RPATH 15
#define DT_SYMBOLIC 16
#define DT_REL 17
#define DT_RELSZ 18
#define DT_RELENT 19
#define DT_PLTREL 20
#define DT_DEBUG 21
#define DT_TEXTREL 22
#define DT_JMPREL 23
#define DT_BIND_NOW 24
#define DT_INIT_ARRAY 25
#define DT_FINI_ARRAY 26
#define DT_INIT_ARRAYSZ 27
#define DT_FINI_ARRAYSZ 28
#define DT_RUNPATH 29
#define DT_FLAGS 30
#define DT_ENCODING 32
#define DT_PREINIT_ARRAY 32
#define DT_PREINIT_ARRAYSZ 33
#define DT_MAXPOSTAGS 34
#define DT_LOOS 0x6000000D
#define DT_HIOS 0x6ffff000
#define DT_LOPROC 0x70000000
#define DT_HIPROC 0x7FFFFFFF

// DT_FLAGS
// values
// Single-bit flags, meant to be OR-ed together.
#define DF_ORIGIN 0x1
#define DF_SYMBOLIC 0x2
#define DF_TEXTREL 0x4
#define DF_BIND_NOW 0x8
#define DF_STATIC_TLS 0x10

// ELF file header
// The 32- and 64-bit variants have the same members in the same order; only
// the widths of e_entry, e_phoff and e_shoff differ (Elf32_* vs Elf64_*).
struct Elf32_Ehdr
{
    unsigned char e_ident[EI_NIDENT];
    Elf_Half e_type;
    Elf_Half e_machine;
    Elf_Word e_version;
    Elf32_Addr e_entry;
    Elf32_Off e_phoff;
    Elf32_Off e_shoff;
    Elf_Word e_flags;
    Elf_Half e_ehsize;
    Elf_Half e_phentsize;
    Elf_Half e_phnum;
    Elf_Half e_shentsize;
    Elf_Half e_shnum;
    Elf_Half e_shstrndx;
};

struct Elf64_Ehdr
{
    unsigned char e_ident[EI_NIDENT];
    Elf_Half e_type;
    Elf_Half e_machine;
    Elf_Word e_version;
    Elf64_Addr e_entry;
    Elf64_Off e_phoff;
    Elf64_Off e_shoff;
    Elf_Word e_flags;
    Elf_Half e_ehsize;
    Elf_Half e_phentsize;
    Elf_Half e_phnum;
    Elf_Half e_shentsize;
    Elf_Half e_shnum;
    Elf_Half e_shstrndx;
};

// Section header
// Same member order in both variants; the 64-bit one widens sh_flags,
// sh_size, sh_addralign and sh_entsize to Elf_Xword.
struct Elf32_Shdr
{
    Elf_Word sh_name;
    Elf_Word sh_type;
    Elf_Word sh_flags;
    Elf32_Addr sh_addr;
    Elf32_Off sh_offset;
    Elf_Word sh_size;
    Elf_Word sh_link;
    Elf_Word sh_info;
    Elf_Word sh_addralign;
    Elf_Word sh_entsize;
};

struct Elf64_Shdr
{
    Elf_Word sh_name;
    Elf_Word sh_type;
    Elf_Xword sh_flags;
    Elf64_Addr sh_addr;
    Elf64_Off sh_offset;
    Elf_Xword sh_size;
    Elf_Word sh_link;
    Elf_Word sh_info;
    Elf_Xword sh_addralign;
    Elf_Xword sh_entsize;
};

// Segment header
// Member ORDER differs between the two variants: Elf32_Phdr keeps p_flags
// after p_memsz, while Elf64_Phdr places p_flags directly after p_type.
struct Elf32_Phdr
{
    Elf_Word p_type;
    Elf32_Off p_offset;
    Elf32_Addr p_vaddr;
    Elf32_Addr p_paddr;
    Elf_Word p_filesz;
    Elf_Word p_memsz;
    Elf_Word p_flags;
    Elf_Word p_align;
};

struct Elf64_Phdr
{
    Elf_Word p_type;
    Elf_Word p_flags;
    Elf64_Off p_offset;
    Elf64_Addr p_vaddr;
    Elf64_Addr p_paddr;
    Elf_Xword p_filesz;
    Elf_Xword p_memsz;
    Elf_Xword p_align;
};

// Symbol table entry
// Member ORDER differs here too: Elf64_Sym moves st_info, st_other and
// st_shndx ahead of st_value and st_size.
struct Elf32_Sym
{
    Elf_Word st_name;
    Elf32_Addr st_value;
    Elf_Word st_size;
    unsigned char st_info;
    unsigned char st_other;
    Elf_Half st_shndx;
};

struct Elf64_Sym
{
    Elf_Word st_name;
    unsigned char st_info;
    unsigned char st_other;
    Elf_Half st_shndx;
    Elf64_Addr st_value;
    Elf_Xword st_size;
};

// st_info packing: binding in the bits above bit 3, type in the low 4 bits.
// ELF_ST_INFO combines the two; ELF_ST_BIND / ELF_ST_TYPE split them again.
#define ELF_ST_BIND( i ) ( ( i ) >> 4 )
#define ELF_ST_TYPE( i ) ( (i)&0xf )
#define ELF_ST_INFO( b, t ) ( ( ( b ) << 4 ) + ( (t)&0xf ) )

// Visibility is the low 2 bits of the value passed in.
#define ELF_ST_VISIBILITY( o ) ( (o)&0x3 )

// Relocation entries
// The *Rela variants add an explicit signed r_addend member.
struct Elf32_Rel
{
    Elf32_Addr r_offset;
    Elf_Word r_info;
};

struct Elf32_Rela
{
    Elf32_Addr r_offset;
    Elf_Word r_info;
    Elf_Sword r_addend;
};

struct Elf64_Rel
{
    Elf64_Addr r_offset;
    Elf_Xword r_info;
};

struct Elf64_Rela
{
    Elf64_Addr r_offset;
    Elf_Xword r_info;
    Elf_Sxword r_addend;
};

// 32-bit r_info packing: symbol index above bit 7, type in the low byte.
#define ELF32_R_SYM( i ) ( ( i ) >> 8 )
#define ELF32_R_TYPE( i ) ( (unsigned char)( i ) )
#define ELF32_R_INFO( s, t ) ( ( ( s ) << 8 ) + (unsigned char)( t ) )

// 64-bit r_info packing: symbol index in the upper 32 bits, type in the lower
// 32 bits. ELF64_R_INFO widens `s` to int64_t before shifting.
#define ELF64_R_SYM( i ) ( ( i ) >> 32 )
#define ELF64_R_TYPE( i ) ( (i)&0xffffffffL )
#define ELF64_R_INFO( s, t ) \
    ( ( ( ( int64_t )( s ) ) << 32 ) + ( (t)&0xffffffffL ) )

// Dynamic structure
// d_un is a union, so d_val and d_ptr share the same storage.
struct Elf32_Dyn
{
    Elf_Sword d_tag;
    union {
        Elf_Word d_val;
        Elf32_Addr d_ptr;
    } d_un;
};

struct Elf64_Dyn
{
    Elf_Sxword d_tag;
    union {
        Elf_Xword d_val;
        Elf64_Addr d_ptr;
    } d_un;
};

} // namespace ELFIO

#endif // ELFTYPES_H

/*** End of inlined file: elf_types.hpp ***/


/*** Start of inlined file: elfio_version.hpp ***/
#define ELFIO_VERSION "3.8"

/*** End of inlined file: elfio_version.hpp ***/


/*** Start of inlined file: elfio_utils.hpp ***/
#ifndef ELFIO_UTILS_HPP
#define ELFIO_UTILS_HPP

// Accessor generators. Each expands to member function definitions and must
// be used inside a class that has a pointer-like member named `convertor`
// (it is dereferenced and called as `( *convertor )( FIELD )`).

// Defines `TYPE get_NAME() const`, returning FIELD passed through convertor.
#define ELFIO_GET_ACCESS( TYPE, NAME, FIELD ) \
    TYPE get_##NAME() const { return ( *convertor )( FIELD ); }
// Defines `void set_NAME( TYPE value )`. The value is first stored into FIELD
// and only then passed through convertor, so the conversion is applied to an
// object of FIELD's own type rather than of TYPE.
#define ELFIO_SET_ACCESS( TYPE, NAME, FIELD ) \
    void set_##NAME( TYPE value ) \
    { \
        FIELD = value; \
        FIELD = ( *convertor )( FIELD ); \
    }
// Defines both the getter and the setter described above.
#define ELFIO_GET_SET_ACCESS( TYPE, NAME, FIELD ) \
    TYPE get_##NAME() const { return ( *convertor )( FIELD ); } \
    void set_##NAME( TYPE value ) \
    { \
        FIELD = value; \
FIELD = ( *convertor )( FIELD ); \ + } + +#define ELFIO_GET_ACCESS_DECL( TYPE, NAME ) virtual TYPE get_##NAME() const = 0 + +#define ELFIO_SET_ACCESS_DECL( TYPE, NAME ) \ + virtual void set_##NAME( TYPE value ) = 0 + +#define ELFIO_GET_SET_ACCESS_DECL( TYPE, NAME ) \ + virtual TYPE get_##NAME() const = 0; \ + virtual void set_##NAME( TYPE value ) = 0 + +namespace ELFIO { + +//------------------------------------------------------------------------------ +class endianess_convertor +{ + public: + //------------------------------------------------------------------------------ + endianess_convertor() { need_conversion = false; } + + //------------------------------------------------------------------------------ + void setup( unsigned char elf_file_encoding ) + { + need_conversion = ( elf_file_encoding != get_host_encoding() ); + } + + //------------------------------------------------------------------------------ + uint64_t operator()( uint64_t value ) const + { + if ( !need_conversion ) { + return value; + } + value = ( ( value & 0x00000000000000FFull ) << 56 ) | + ( ( value & 0x000000000000FF00ull ) << 40 ) | + ( ( value & 0x0000000000FF0000ull ) << 24 ) | + ( ( value & 0x00000000FF000000ull ) << 8 ) | + ( ( value & 0x000000FF00000000ull ) >> 8 ) | + ( ( value & 0x0000FF0000000000ull ) >> 24 ) | + ( ( value & 0x00FF000000000000ull ) >> 40 ) | + ( ( value & 0xFF00000000000000ull ) >> 56 ); + + return value; + } + + //------------------------------------------------------------------------------ + int64_t operator()( int64_t value ) const + { + if ( !need_conversion ) { + return value; + } + return ( int64_t )( *this )( (uint64_t)value ); + } + + //------------------------------------------------------------------------------ + uint32_t operator()( uint32_t value ) const + { + if ( !need_conversion ) { + return value; + } + value = + ( ( value & 0x000000FF ) << 24 ) | ( ( value & 0x0000FF00 ) << 8 ) | + ( ( value & 0x00FF0000 ) >> 8 ) | ( ( value & 0xFF000000 ) >> 
24 ); + + return value; + } + + //------------------------------------------------------------------------------ + int32_t operator()( int32_t value ) const + { + if ( !need_conversion ) { + return value; + } + return ( int32_t )( *this )( (uint32_t)value ); + } + + //------------------------------------------------------------------------------ + uint16_t operator()( uint16_t value ) const + { + if ( !need_conversion ) { + return value; + } + value = ( ( value & 0x00FF ) << 8 ) | ( ( value & 0xFF00 ) >> 8 ); + + return value; + } + + //------------------------------------------------------------------------------ + int16_t operator()( int16_t value ) const + { + if ( !need_conversion ) { + return value; + } + return ( int16_t )( *this )( (uint16_t)value ); + } + + //------------------------------------------------------------------------------ + int8_t operator()( int8_t value ) const { return value; } + + //------------------------------------------------------------------------------ + uint8_t operator()( uint8_t value ) const { return value; } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + unsigned char get_host_encoding() const + { + static const int tmp = 1; + if ( 1 == *(const char*)&tmp ) { + return ELFDATA2LSB; + } + else { + return ELFDATA2MSB; + } + } + + //------------------------------------------------------------------------------ + private: + bool need_conversion; +}; + +//------------------------------------------------------------------------------ +inline uint32_t elf_hash( const unsigned char* name ) +{ + uint32_t h = 0, g; + while ( *name ) { + h = ( h << 4 ) + *name++; + g = h & 0xf0000000; + if ( g != 0 ) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +} // namespace ELFIO + +#endif // ELFIO_UTILS_HPP + +/*** End of inlined file: elfio_utils.hpp ***/ + + +/*** Start of inlined file: elfio_header.hpp ***/ 
+#ifndef ELF_HEADER_HPP +#define ELF_HEADER_HPP + +#include + +namespace ELFIO { + +class elf_header +{ + public: + virtual ~elf_header(){}; + virtual bool load( std::istream& stream ) = 0; + virtual bool save( std::ostream& stream ) const = 0; + + // ELF header functions + ELFIO_GET_ACCESS_DECL( unsigned char, class ); + ELFIO_GET_ACCESS_DECL( unsigned char, elf_version ); + ELFIO_GET_ACCESS_DECL( unsigned char, encoding ); + ELFIO_GET_ACCESS_DECL( Elf_Half, header_size ); + ELFIO_GET_ACCESS_DECL( Elf_Half, section_entry_size ); + ELFIO_GET_ACCESS_DECL( Elf_Half, segment_entry_size ); + + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, version ); + ELFIO_GET_SET_ACCESS_DECL( unsigned char, os_abi ); + ELFIO_GET_SET_ACCESS_DECL( unsigned char, abi_version ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, type ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, machine ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, flags ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, entry ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, sections_num ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, sections_offset ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, segments_num ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, segments_offset ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Half, section_name_str_index ); +}; + +template struct elf_header_impl_types; +template <> struct elf_header_impl_types +{ + typedef Elf32_Phdr Phdr_type; + typedef Elf32_Shdr Shdr_type; + static const unsigned char file_class = ELFCLASS32; +}; +template <> struct elf_header_impl_types +{ + typedef Elf64_Phdr Phdr_type; + typedef Elf64_Shdr Shdr_type; + static const unsigned char file_class = ELFCLASS64; +}; + +template class elf_header_impl : public elf_header +{ + public: + //------------------------------------------------------------------------------ + elf_header_impl( endianess_convertor* convertor_, unsigned char encoding ) + { + convertor = convertor_; + + std::fill_n( reinterpret_cast( &header ), sizeof( header ), + '\0' ); + + header.e_ident[EI_MAG0] = ELFMAG0; + 
header.e_ident[EI_MAG1] = ELFMAG1; + header.e_ident[EI_MAG2] = ELFMAG2; + header.e_ident[EI_MAG3] = ELFMAG3; + header.e_ident[EI_CLASS] = elf_header_impl_types::file_class; + header.e_ident[EI_DATA] = encoding; + header.e_ident[EI_VERSION] = EV_CURRENT; + header.e_version = ( *convertor )( (Elf_Word)EV_CURRENT ); + header.e_ehsize = ( sizeof( header ) ); + header.e_ehsize = ( *convertor )( header.e_ehsize ); + header.e_shstrndx = ( *convertor )( (Elf_Half)1 ); + header.e_phentsize = + sizeof( typename elf_header_impl_types::Phdr_type ); + header.e_shentsize = + sizeof( typename elf_header_impl_types::Shdr_type ); + header.e_phentsize = ( *convertor )( header.e_phentsize ); + header.e_shentsize = ( *convertor )( header.e_shentsize ); + } + + //------------------------------------------------------------------------------ + bool load( std::istream& stream ) + { + stream.seekg( 0 ); + stream.read( reinterpret_cast( &header ), sizeof( header ) ); + + return ( stream.gcount() == sizeof( header ) ); + } + + //------------------------------------------------------------------------------ + bool save( std::ostream& stream ) const + { + stream.seekp( 0 ); + stream.write( reinterpret_cast( &header ), + sizeof( header ) ); + + return stream.good(); + } + + //------------------------------------------------------------------------------ + // ELF header functions + ELFIO_GET_ACCESS( unsigned char, class, header.e_ident[EI_CLASS] ); + ELFIO_GET_ACCESS( unsigned char, elf_version, header.e_ident[EI_VERSION] ); + ELFIO_GET_ACCESS( unsigned char, encoding, header.e_ident[EI_DATA] ); + ELFIO_GET_ACCESS( Elf_Half, header_size, header.e_ehsize ); + ELFIO_GET_ACCESS( Elf_Half, section_entry_size, header.e_shentsize ); + ELFIO_GET_ACCESS( Elf_Half, segment_entry_size, header.e_phentsize ); + + ELFIO_GET_SET_ACCESS( Elf_Word, version, header.e_version ); + ELFIO_GET_SET_ACCESS( unsigned char, os_abi, header.e_ident[EI_OSABI] ); + ELFIO_GET_SET_ACCESS( unsigned char, + abi_version, + 
header.e_ident[EI_ABIVERSION] ); + ELFIO_GET_SET_ACCESS( Elf_Half, type, header.e_type ); + ELFIO_GET_SET_ACCESS( Elf_Half, machine, header.e_machine ); + ELFIO_GET_SET_ACCESS( Elf_Word, flags, header.e_flags ); + ELFIO_GET_SET_ACCESS( Elf_Half, section_name_str_index, header.e_shstrndx ); + ELFIO_GET_SET_ACCESS( Elf64_Addr, entry, header.e_entry ); + ELFIO_GET_SET_ACCESS( Elf_Half, sections_num, header.e_shnum ); + ELFIO_GET_SET_ACCESS( Elf64_Off, sections_offset, header.e_shoff ); + ELFIO_GET_SET_ACCESS( Elf_Half, segments_num, header.e_phnum ); + ELFIO_GET_SET_ACCESS( Elf64_Off, segments_offset, header.e_phoff ); + + private: + T header; + endianess_convertor* convertor; +}; + +} // namespace ELFIO + +#endif // ELF_HEADER_HPP + +/*** End of inlined file: elfio_header.hpp ***/ + + +/*** Start of inlined file: elfio_section.hpp ***/ +#ifndef ELFIO_SECTION_HPP +#define ELFIO_SECTION_HPP + +#include +#include +#include + +namespace ELFIO { + +class section +{ + friend class elfio; + + public: + virtual ~section(){}; + + ELFIO_GET_ACCESS_DECL( Elf_Half, index ); + ELFIO_GET_SET_ACCESS_DECL( std::string, name ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, type ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, flags ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, info ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, link ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, addr_align ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, entry_size ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, address ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, size ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, name_string_offset ); + ELFIO_GET_ACCESS_DECL( Elf64_Off, offset ); + + virtual const char* get_data() const = 0; + virtual void set_data( const char* pData, Elf_Word size ) = 0; + virtual void set_data( const std::string& data ) = 0; + virtual void append_data( const char* pData, Elf_Word size ) = 0; + virtual void append_data( const std::string& data ) = 0; + virtual size_t get_stream_size() const = 0; + virtual void 
set_stream_size( size_t value ) = 0; + + protected: + ELFIO_SET_ACCESS_DECL( Elf64_Off, offset ); + ELFIO_SET_ACCESS_DECL( Elf_Half, index ); + + virtual void load( std::istream& stream, std::streampos header_offset ) = 0; + virtual void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) = 0; + virtual bool is_address_initialized() const = 0; +}; + +template class section_impl : public section +{ + public: + //------------------------------------------------------------------------------ + section_impl( const endianess_convertor* convertor_ ) + : convertor( convertor_ ) + { + std::fill_n( reinterpret_cast( &header ), sizeof( header ), + '\0' ); + is_address_set = false; + data = 0; + data_size = 0; + index = 0; + stream_size = 0; + } + + //------------------------------------------------------------------------------ + ~section_impl() { delete[] data; } + + //------------------------------------------------------------------------------ + // Section info functions + ELFIO_GET_SET_ACCESS( Elf_Word, type, header.sh_type ); + ELFIO_GET_SET_ACCESS( Elf_Xword, flags, header.sh_flags ); + ELFIO_GET_SET_ACCESS( Elf_Xword, size, header.sh_size ); + ELFIO_GET_SET_ACCESS( Elf_Word, link, header.sh_link ); + ELFIO_GET_SET_ACCESS( Elf_Word, info, header.sh_info ); + ELFIO_GET_SET_ACCESS( Elf_Xword, addr_align, header.sh_addralign ); + ELFIO_GET_SET_ACCESS( Elf_Xword, entry_size, header.sh_entsize ); + ELFIO_GET_SET_ACCESS( Elf_Word, name_string_offset, header.sh_name ); + ELFIO_GET_ACCESS( Elf64_Addr, address, header.sh_addr ); + + //------------------------------------------------------------------------------ + Elf_Half get_index() const { return index; } + + //------------------------------------------------------------------------------ + std::string get_name() const { return name; } + + //------------------------------------------------------------------------------ + void set_name( std::string name_ ) { name = name_; } + + 
//------------------------------------------------------------------------------ + void set_address( Elf64_Addr value ) + { + header.sh_addr = value; + header.sh_addr = ( *convertor )( header.sh_addr ); + is_address_set = true; + } + + //------------------------------------------------------------------------------ + bool is_address_initialized() const { return is_address_set; } + + //------------------------------------------------------------------------------ + const char* get_data() const { return data; } + + //------------------------------------------------------------------------------ + void set_data( const char* raw_data, Elf_Word size ) + { + if ( get_type() != SHT_NOBITS ) { + delete[] data; + data = new ( std::nothrow ) char[size]; + if ( 0 != data && 0 != raw_data ) { + data_size = size; + std::copy( raw_data, raw_data + size, data ); + } + else { + data_size = 0; + } + } + + set_size( data_size ); + } + + //------------------------------------------------------------------------------ + void set_data( const std::string& str_data ) + { + return set_data( str_data.c_str(), (Elf_Word)str_data.size() ); + } + + //------------------------------------------------------------------------------ + void append_data( const char* raw_data, Elf_Word size ) + { + if ( get_type() != SHT_NOBITS ) { + if ( get_size() + size < data_size ) { + std::copy( raw_data, raw_data + size, data + get_size() ); + } + else { + data_size = 2 * ( data_size + size ); + char* new_data = new ( std::nothrow ) char[data_size]; + + if ( 0 != new_data ) { + std::copy( data, data + get_size(), new_data ); + std::copy( raw_data, raw_data + size, + new_data + get_size() ); + delete[] data; + data = new_data; + } + else { + size = 0; + } + } + set_size( get_size() + size ); + } + } + + //------------------------------------------------------------------------------ + void append_data( const std::string& str_data ) + { + return append_data( str_data.c_str(), (Elf_Word)str_data.size() ); + } + + 
//------------------------------------------------------------------------------ + protected: + //------------------------------------------------------------------------------ + ELFIO_GET_SET_ACCESS( Elf64_Off, offset, header.sh_offset ); + + //------------------------------------------------------------------------------ + void set_index( Elf_Half value ) { index = value; } + + //------------------------------------------------------------------------------ + void load( std::istream& stream, std::streampos header_offset ) + { + std::fill_n( reinterpret_cast( &header ), sizeof( header ), + '\0' ); + + stream.seekg( 0, stream.end ); + set_stream_size( stream.tellg() ); + + stream.seekg( header_offset ); + stream.read( reinterpret_cast( &header ), sizeof( header ) ); + + Elf_Xword size = get_size(); + if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() && + size < get_stream_size() ) { + data = new ( std::nothrow ) char[size + 1]; + + if ( ( 0 != size ) && ( 0 != data ) ) { + stream.seekg( ( *convertor )( header.sh_offset ) ); + stream.read( data, size ); + data[size] = 0; // Ensure data is ended with 0 to avoid oob read + data_size = size; + } + else { + data_size = 0; + } + } + } + + //------------------------------------------------------------------------------ + void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) + { + if ( 0 != get_index() ) { + header.sh_offset = data_offset; + header.sh_offset = ( *convertor )( header.sh_offset ); + } + + save_header( stream, header_offset ); + if ( get_type() != SHT_NOBITS && get_type() != SHT_NULL && + get_size() != 0 && data != 0 ) { + save_data( stream, data_offset ); + } + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + void save_header( std::ostream& stream, std::streampos header_offset ) const + { + stream.seekp( 
header_offset ); + stream.write( reinterpret_cast( &header ), + sizeof( header ) ); + } + + //------------------------------------------------------------------------------ + void save_data( std::ostream& stream, std::streampos data_offset ) const + { + stream.seekp( data_offset ); + stream.write( get_data(), get_size() ); + } + + //------------------------------------------------------------------------------ + size_t get_stream_size() const { return stream_size; } + + //------------------------------------------------------------------------------ + void set_stream_size( size_t value ) { stream_size = value; } + + //------------------------------------------------------------------------------ + private: + T header; + Elf_Half index; + std::string name; + char* data; + Elf_Word data_size; + const endianess_convertor* convertor; + bool is_address_set; + size_t stream_size; +}; + +} // namespace ELFIO + +#endif // ELFIO_SECTION_HPP + +/*** End of inlined file: elfio_section.hpp ***/ + + +/*** Start of inlined file: elfio_segment.hpp ***/ +#ifndef ELFIO_SEGMENT_HPP +#define ELFIO_SEGMENT_HPP + +#include +#include +#include + +namespace ELFIO { + +class segment +{ + friend class elfio; + + public: + virtual ~segment(){}; + + ELFIO_GET_ACCESS_DECL( Elf_Half, index ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, type ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, flags ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, align ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, virtual_address ); + ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, physical_address ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, file_size ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, memory_size ); + ELFIO_GET_ACCESS_DECL( Elf64_Off, offset ); + + virtual const char* get_data() const = 0; + + virtual Elf_Half add_section_index( Elf_Half index, + Elf_Xword addr_align ) = 0; + virtual Elf_Half get_sections_num() const = 0; + virtual Elf_Half get_section_index_at( Elf_Half num ) const = 0; + virtual bool is_offset_initialized() const = 
0; + + protected: + ELFIO_SET_ACCESS_DECL( Elf64_Off, offset ); + ELFIO_SET_ACCESS_DECL( Elf_Half, index ); + + virtual const std::vector& get_sections() const = 0; + virtual void load( std::istream& stream, std::streampos header_offset ) = 0; + virtual void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) = 0; +}; + +//------------------------------------------------------------------------------ +template class segment_impl : public segment +{ + public: + //------------------------------------------------------------------------------ + segment_impl( endianess_convertor* convertor_ ) + : stream_size( 0 ), index( 0 ), data( 0 ), convertor( convertor_ ) + { + is_offset_set = false; + std::fill_n( reinterpret_cast( &ph ), sizeof( ph ), '\0' ); + } + + //------------------------------------------------------------------------------ + virtual ~segment_impl() { delete[] data; } + + //------------------------------------------------------------------------------ + // Section info functions + ELFIO_GET_SET_ACCESS( Elf_Word, type, ph.p_type ); + ELFIO_GET_SET_ACCESS( Elf_Word, flags, ph.p_flags ); + ELFIO_GET_SET_ACCESS( Elf_Xword, align, ph.p_align ); + ELFIO_GET_SET_ACCESS( Elf64_Addr, virtual_address, ph.p_vaddr ); + ELFIO_GET_SET_ACCESS( Elf64_Addr, physical_address, ph.p_paddr ); + ELFIO_GET_SET_ACCESS( Elf_Xword, file_size, ph.p_filesz ); + ELFIO_GET_SET_ACCESS( Elf_Xword, memory_size, ph.p_memsz ); + ELFIO_GET_ACCESS( Elf64_Off, offset, ph.p_offset ); + size_t stream_size; + + //------------------------------------------------------------------------------ + size_t get_stream_size() const { return stream_size; } + + //------------------------------------------------------------------------------ + void set_stream_size( size_t value ) { stream_size = value; } + + //------------------------------------------------------------------------------ + Elf_Half get_index() const { return index; } + + 
//------------------------------------------------------------------------------ + const char* get_data() const { return data; } + + //------------------------------------------------------------------------------ + Elf_Half add_section_index( Elf_Half sec_index, Elf_Xword addr_align ) + { + sections.push_back( sec_index ); + if ( addr_align > get_align() ) { + set_align( addr_align ); + } + + return (Elf_Half)sections.size(); + } + + //------------------------------------------------------------------------------ + Elf_Half get_sections_num() const { return (Elf_Half)sections.size(); } + + //------------------------------------------------------------------------------ + Elf_Half get_section_index_at( Elf_Half num ) const + { + if ( num < sections.size() ) { + return sections[num]; + } + + return Elf_Half( -1 ); + } + + //------------------------------------------------------------------------------ + protected: + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + void set_offset( Elf64_Off value ) + { + ph.p_offset = value; + ph.p_offset = ( *convertor )( ph.p_offset ); + is_offset_set = true; + } + + //------------------------------------------------------------------------------ + bool is_offset_initialized() const { return is_offset_set; } + + //------------------------------------------------------------------------------ + const std::vector& get_sections() const { return sections; } + + //------------------------------------------------------------------------------ + void set_index( Elf_Half value ) { index = value; } + + //------------------------------------------------------------------------------ + void load( std::istream& stream, std::streampos header_offset ) + { + + stream.seekg( 0, stream.end ); + set_stream_size( stream.tellg() ); + + stream.seekg( header_offset ); + stream.read( reinterpret_cast( &ph ), sizeof( ph ) ); + 
is_offset_set = true; + + if ( PT_NULL != get_type() && 0 != get_file_size() ) { + stream.seekg( ( *convertor )( ph.p_offset ) ); + Elf_Xword size = get_file_size(); + + if ( size > get_stream_size() ) { + data = 0; + } + else { + data = new (std::nothrow) char[size + 1]; + + if ( 0 != data ) { + stream.read( data, size ); + data[size] = 0; + } + } + } + } + + //------------------------------------------------------------------------------ + void save( std::ostream& stream, + std::streampos header_offset, + std::streampos data_offset ) + { + ph.p_offset = data_offset; + ph.p_offset = ( *convertor )( ph.p_offset ); + stream.seekp( header_offset ); + stream.write( reinterpret_cast( &ph ), sizeof( ph ) ); + } + + //------------------------------------------------------------------------------ + private: + T ph; + Elf_Half index; + char* data; + std::vector sections; + endianess_convertor* convertor; + bool is_offset_set; +}; + +} // namespace ELFIO + +#endif // ELFIO_SEGMENT_HPP + +/*** End of inlined file: elfio_segment.hpp ***/ + + +/*** Start of inlined file: elfio_strings.hpp ***/ +#ifndef ELFIO_STRINGS_HPP +#define ELFIO_STRINGS_HPP + +#include +#include +#include + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class string_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + string_section_accessor_template( S* section_ ) : string_section( section_ ) + { + } + + //------------------------------------------------------------------------------ + const char* get_string( Elf_Word index ) const + { + if ( string_section ) { + if ( index < string_section->get_size() ) { + const char* data = string_section->get_data(); + if ( 0 != data ) { + return data + index; + } + } + } + + return 0; + } + + //------------------------------------------------------------------------------ + Elf_Word add_string( const char* str ) + { + Elf_Word 
current_position = 0; + + if ( string_section ) { + // Strings are addeded to the end of the current section data + current_position = (Elf_Word)string_section->get_size(); + + if ( current_position == 0 ) { + char empty_string = '\0'; + string_section->append_data( &empty_string, 1 ); + current_position++; + } + string_section->append_data( str, + (Elf_Word)std::strlen( str ) + 1 ); + } + + return current_position; + } + + //------------------------------------------------------------------------------ + Elf_Word add_string( const std::string& str ) + { + return add_string( str.c_str() ); + } + + //------------------------------------------------------------------------------ + private: + S* string_section; +}; + +using string_section_accessor = string_section_accessor_template
; +using const_string_section_accessor = + string_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_STRINGS_HPP + +/*** End of inlined file: elfio_strings.hpp ***/ + +#define ELFIO_HEADER_ACCESS_GET( TYPE, FNAME ) \ + TYPE get_##FNAME() const { return header ? ( header->get_##FNAME() ) : 0; } + +#define ELFIO_HEADER_ACCESS_GET_SET( TYPE, FNAME ) \ + TYPE get_##FNAME() const \ + { \ + return header ? ( header->get_##FNAME() ) : 0; \ + } \ + void set_##FNAME( TYPE val ) \ + { \ + if ( header ) { \ + header->set_##FNAME( val ); \ + } \ + } + +namespace ELFIO { + +//------------------------------------------------------------------------------ +class elfio +{ + public: + //------------------------------------------------------------------------------ + elfio() : sections( this ), segments( this ) + { + header = 0; + current_file_pos = 0; + create( ELFCLASS32, ELFDATA2LSB ); + } + + //------------------------------------------------------------------------------ + ~elfio() { clean(); } + + //------------------------------------------------------------------------------ + void create( unsigned char file_class, unsigned char encoding ) + { + clean(); + convertor.setup( encoding ); + header = create_header( file_class, encoding ); + create_mandatory_sections(); + } + + //------------------------------------------------------------------------------ + bool load( const std::string& file_name ) + { + std::ifstream stream; + stream.open( file_name.c_str(), std::ios::in | std::ios::binary ); + if ( !stream ) { + return false; + } + + return load( stream ); + } + + //------------------------------------------------------------------------------ + bool load( std::istream& stream ) + { + clean(); + + unsigned char e_ident[EI_NIDENT]; + // Read ELF file signature + stream.read( reinterpret_cast( &e_ident ), sizeof( e_ident ) ); + + // Is it ELF file? 
+ if ( stream.gcount() != sizeof( e_ident ) || + e_ident[EI_MAG0] != ELFMAG0 || e_ident[EI_MAG1] != ELFMAG1 || + e_ident[EI_MAG2] != ELFMAG2 || e_ident[EI_MAG3] != ELFMAG3 ) { + return false; + } + + if ( ( e_ident[EI_CLASS] != ELFCLASS64 ) && + ( e_ident[EI_CLASS] != ELFCLASS32 ) ) { + return false; + } + + convertor.setup( e_ident[EI_DATA] ); + header = create_header( e_ident[EI_CLASS], e_ident[EI_DATA] ); + if ( 0 == header ) { + return false; + } + if ( !header->load( stream ) ) { + return false; + } + + load_sections( stream ); + bool is_still_good = load_segments( stream ); + return is_still_good; + } + + //------------------------------------------------------------------------------ + bool save( const std::string& file_name ) + { + std::ofstream stream; + stream.open( file_name.c_str(), std::ios::out | std::ios::binary ); + if ( !stream ) { + return false; + } + + return save( stream ); + } + + //------------------------------------------------------------------------------ + bool save( std::ostream& stream ) + { + if ( !stream || !header ) { + return false; + } + + bool is_still_good = true; + // Define layout specific header fields + // The position of the segment table is fixed after the header. + // The position of the section table is variable and needs to be fixed + // before saving. + header->set_segments_num( segments.size() ); + header->set_segments_offset( segments.size() ? 
header->get_header_size() + : 0 ); + header->set_sections_num( sections.size() ); + header->set_sections_offset( 0 ); + + // Layout the first section right after the segment table + current_file_pos = header->get_header_size() + + header->get_segment_entry_size() * + (Elf_Xword)header->get_segments_num(); + + calc_segment_alignment(); + + is_still_good = layout_segments_and_their_sections(); + is_still_good = is_still_good && layout_sections_without_segments(); + is_still_good = is_still_good && layout_section_table(); + + is_still_good = is_still_good && save_header( stream ); + is_still_good = is_still_good && save_sections( stream ); + is_still_good = is_still_good && save_segments( stream ); + + return is_still_good; + } + + //------------------------------------------------------------------------------ + // ELF header access functions + ELFIO_HEADER_ACCESS_GET( unsigned char, class ); + ELFIO_HEADER_ACCESS_GET( unsigned char, elf_version ); + ELFIO_HEADER_ACCESS_GET( unsigned char, encoding ); + ELFIO_HEADER_ACCESS_GET( Elf_Word, version ); + ELFIO_HEADER_ACCESS_GET( Elf_Half, header_size ); + ELFIO_HEADER_ACCESS_GET( Elf_Half, section_entry_size ); + ELFIO_HEADER_ACCESS_GET( Elf_Half, segment_entry_size ); + + ELFIO_HEADER_ACCESS_GET_SET( unsigned char, os_abi ); + ELFIO_HEADER_ACCESS_GET_SET( unsigned char, abi_version ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Half, type ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Half, machine ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Word, flags ); + ELFIO_HEADER_ACCESS_GET_SET( Elf64_Addr, entry ); + ELFIO_HEADER_ACCESS_GET_SET( Elf64_Off, sections_offset ); + ELFIO_HEADER_ACCESS_GET_SET( Elf64_Off, segments_offset ); + ELFIO_HEADER_ACCESS_GET_SET( Elf_Half, section_name_str_index ); + + //------------------------------------------------------------------------------ + const endianess_convertor& get_convertor() const { return convertor; } + + //------------------------------------------------------------------------------ + Elf_Xword 
get_default_entry_size( Elf_Word section_type ) const + { + switch ( section_type ) { + case SHT_RELA: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Rela ); + } + else { + return sizeof( Elf32_Rela ); + } + case SHT_REL: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Rel ); + } + else { + return sizeof( Elf32_Rel ); + } + case SHT_SYMTAB: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Sym ); + } + else { + return sizeof( Elf32_Sym ); + } + case SHT_DYNAMIC: + if ( header->get_class() == ELFCLASS64 ) { + return sizeof( Elf64_Dyn ); + } + else { + return sizeof( Elf32_Dyn ); + } + default: + return 0; + } + } + + //------------------------------------------------------------------------------ + private: + bool is_offset_in_section( Elf64_Off offset, const section* sec ) const + { + return ( offset >= sec->get_offset() ) && + ( offset < ( sec->get_offset() + sec->get_size() ) ); + } + + //------------------------------------------------------------------------------ + public: + //! returns an empty string if no problems are detected, + //! 
or a string containing an error message if problems are found + std::string validate() const + { + + // check for overlapping sections in the file + for ( int i = 0; i < sections.size(); ++i ) { + for ( int j = i + 1; j < sections.size(); ++j ) { + const section* a = sections[i]; + const section* b = sections[j]; + if ( !( a->get_type() & SHT_NOBITS ) && + !( b->get_type() & SHT_NOBITS ) && ( a->get_size() > 0 ) && + ( b->get_size() > 0 ) && ( a->get_offset() > 0 ) && + ( b->get_offset() > 0 ) ) { + if ( is_offset_in_section( a->get_offset(), b ) || + is_offset_in_section( + a->get_offset() + a->get_size() - 1, b ) || + is_offset_in_section( b->get_offset(), a ) || + is_offset_in_section( + b->get_offset() + b->get_size() - 1, a ) ) { + return "Sections " + a->get_name() + " and " + + b->get_name() + " overlap in file"; + } + } + } + } + + // more checks to be added here... + + return ""; + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + void clean() + { + delete header; + header = 0; + + std::vector::const_iterator it; + for ( it = sections_.begin(); it != sections_.end(); ++it ) { + delete *it; + } + sections_.clear(); + + std::vector::const_iterator it1; + for ( it1 = segments_.begin(); it1 != segments_.end(); ++it1 ) { + delete *it1; + } + segments_.clear(); + } + + //------------------------------------------------------------------------------ + elf_header* create_header( unsigned char file_class, + unsigned char encoding ) + { + elf_header* new_header = 0; + + if ( file_class == ELFCLASS64 ) { + new_header = + new elf_header_impl( &convertor, encoding ); + } + else if ( file_class == ELFCLASS32 ) { + new_header = + new elf_header_impl( &convertor, encoding ); + } + else { + return 0; + } + + return new_header; + } + + //------------------------------------------------------------------------------ + section* create_section() 
+ { + section* new_section; + unsigned char file_class = get_class(); + + if ( file_class == ELFCLASS64 ) { + new_section = new section_impl( &convertor ); + } + else if ( file_class == ELFCLASS32 ) { + new_section = new section_impl( &convertor ); + } + else { + return 0; + } + + new_section->set_index( (Elf_Half)sections_.size() ); + sections_.push_back( new_section ); + + return new_section; + } + + //------------------------------------------------------------------------------ + segment* create_segment() + { + segment* new_segment; + unsigned char file_class = header->get_class(); + + if ( file_class == ELFCLASS64 ) { + new_segment = new segment_impl( &convertor ); + } + else if ( file_class == ELFCLASS32 ) { + new_segment = new segment_impl( &convertor ); + } + else { + return 0; + } + + new_segment->set_index( (Elf_Half)segments_.size() ); + segments_.push_back( new_segment ); + + return new_segment; + } + + //------------------------------------------------------------------------------ + void create_mandatory_sections() + { + // Create null section without calling to 'add_section' as no string + // section containing section names exists yet + section* sec0 = create_section(); + sec0->set_index( 0 ); + sec0->set_name( "" ); + sec0->set_name_string_offset( 0 ); + + set_section_name_str_index( 1 ); + section* shstrtab = sections.add( ".shstrtab" ); + shstrtab->set_type( SHT_STRTAB ); + shstrtab->set_addr_align( 1 ); + } + + //------------------------------------------------------------------------------ + Elf_Half load_sections( std::istream& stream ) + { + Elf_Half entry_size = header->get_section_entry_size(); + Elf_Half num = header->get_sections_num(); + Elf64_Off offset = header->get_sections_offset(); + + for ( Elf_Half i = 0; i < num; ++i ) { + section* sec = create_section(); + sec->load( stream, (std::streamoff)offset + + (std::streampos)i * entry_size ); + sec->set_index( i ); + // To mark that the section is not permitted to reassign address + // 
during layout calculation + sec->set_address( sec->get_address() ); + } + + Elf_Half shstrndx = get_section_name_str_index(); + + if ( SHN_UNDEF != shstrndx ) { + string_section_accessor str_reader( sections[shstrndx] ); + for ( Elf_Half i = 0; i < num; ++i ) { + Elf_Word section_offset = sections[i]->get_name_string_offset(); + const char* p = str_reader.get_string( section_offset ); + if ( p != 0 ) { + sections[i]->set_name( p ); + } + } + } + + return num; + } + + //------------------------------------------------------------------------------ + //! Checks whether the addresses of the section entirely fall within the given segment. + //! It doesn't matter if the addresses are memory addresses, or file offsets, + //! they just need to be in the same address space + bool is_sect_in_seg( Elf64_Off sect_begin, + Elf_Xword sect_size, + Elf64_Off seg_begin, + Elf64_Off seg_end ) + { + return ( seg_begin <= sect_begin ) && + ( sect_begin + sect_size <= seg_end ) && + ( sect_begin < + seg_end ); // this is important criteria when sect_size == 0 + // Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11) + // sect_begin=12, sect_size=0 -> shall return false! 
+ } + + //------------------------------------------------------------------------------ + bool load_segments( std::istream& stream ) + { + Elf_Half entry_size = header->get_segment_entry_size(); + Elf_Half num = header->get_segments_num(); + Elf64_Off offset = header->get_segments_offset(); + + for ( Elf_Half i = 0; i < num; ++i ) { + segment* seg; + unsigned char file_class = header->get_class(); + + if ( file_class == ELFCLASS64 ) { + seg = new segment_impl( &convertor ); + } + else if ( file_class == ELFCLASS32 ) { + seg = new segment_impl( &convertor ); + } + else { + return false; + } + + seg->load( stream, (std::streamoff)offset + + (std::streampos)i * entry_size ); + seg->set_index( i ); + + // Add sections to the segments (similar to readelfs algorithm) + Elf64_Off segBaseOffset = seg->get_offset(); + Elf64_Off segEndOffset = segBaseOffset + seg->get_file_size(); + Elf64_Off segVBaseAddr = seg->get_virtual_address(); + Elf64_Off segVEndAddr = segVBaseAddr + seg->get_memory_size(); + for ( Elf_Half j = 0; j < sections.size(); ++j ) { + const section* psec = sections[j]; + + // SHF_ALLOC sections are matched based on the virtual address + // otherwise the file offset is matched + if ( ( psec->get_flags() & SHF_ALLOC ) + ? is_sect_in_seg( psec->get_address(), + psec->get_size(), segVBaseAddr, + segVEndAddr ) + : is_sect_in_seg( psec->get_offset(), psec->get_size(), + segBaseOffset, segEndOffset ) ) { + // Alignment of segment shall not be updated, to preserve original value + // It will be re-calculated on saving. 
+ seg->add_section_index( psec->get_index(), 0 ); + } + } + + // Add section into the segments' container + segments_.push_back( seg ); + } + + return true; + } + + //------------------------------------------------------------------------------ + bool save_header( std::ostream& stream ) { return header->save( stream ); } + + //------------------------------------------------------------------------------ + bool save_sections( std::ostream& stream ) + { + for ( unsigned int i = 0; i < sections_.size(); ++i ) { + section* sec = sections_.at( i ); + + std::streampos headerPosition = + (std::streamoff)header->get_sections_offset() + + (std::streampos)header->get_section_entry_size() * + sec->get_index(); + + sec->save( stream, headerPosition, sec->get_offset() ); + } + return true; + } + + //------------------------------------------------------------------------------ + bool save_segments( std::ostream& stream ) + { + for ( unsigned int i = 0; i < segments_.size(); ++i ) { + segment* seg = segments_.at( i ); + + std::streampos headerPosition = + header->get_segments_offset() + + (std::streampos)header->get_segment_entry_size() * + seg->get_index(); + + seg->save( stream, headerPosition, seg->get_offset() ); + } + return true; + } + + //------------------------------------------------------------------------------ + bool is_section_without_segment( unsigned int section_index ) + { + bool found = false; + + for ( unsigned int j = 0; !found && ( j < segments.size() ); ++j ) { + for ( unsigned int k = 0; + !found && ( k < segments[j]->get_sections_num() ); ++k ) { + found = segments[j]->get_section_index_at( k ) == section_index; + } + } + + return !found; + } + + //------------------------------------------------------------------------------ + bool is_subsequence_of( segment* seg1, segment* seg2 ) + { + // Return 'true' if sections of seg1 are a subset of sections in seg2 + const std::vector& sections1 = seg1->get_sections(); + const std::vector& sections2 = 
seg2->get_sections(); + + bool found = false; + if ( sections1.size() < sections2.size() ) { + found = std::includes( sections2.begin(), sections2.end(), + sections1.begin(), sections1.end() ); + } + + return found; + } + + //------------------------------------------------------------------------------ + std::vector get_ordered_segments() + { + std::vector res; + std::deque worklist; + + res.reserve( segments.size() ); + std::copy( segments_.begin(), segments_.end(), + std::back_inserter( worklist ) ); + + // Bring the segments which start at address 0 to the front + size_t nextSlot = 0; + for ( size_t i = 0; i < worklist.size(); ++i ) { + if ( i != nextSlot && worklist[i]->is_offset_initialized() && + worklist[i]->get_offset() == 0 ) { + if ( worklist[nextSlot]->get_offset() == 0 ) { + ++nextSlot; + } + std::swap( worklist[i], worklist[nextSlot] ); + ++nextSlot; + } + } + + while ( !worklist.empty() ) { + segment* seg = worklist.front(); + worklist.pop_front(); + + size_t i = 0; + for ( ; i < worklist.size(); ++i ) { + if ( is_subsequence_of( seg, worklist[i] ) ) { + break; + } + } + + if ( i < worklist.size() ) + worklist.push_back( seg ); + else + res.push_back( seg ); + } + + return res; + } + + //------------------------------------------------------------------------------ + bool layout_sections_without_segments() + { + for ( unsigned int i = 0; i < sections_.size(); ++i ) { + if ( is_section_without_segment( i ) ) { + section* sec = sections_[i]; + + Elf_Xword section_align = sec->get_addr_align(); + if ( section_align > 1 && + current_file_pos % section_align != 0 ) { + current_file_pos += + section_align - current_file_pos % section_align; + } + + if ( 0 != sec->get_index() ) + sec->set_offset( current_file_pos ); + + if ( SHT_NOBITS != sec->get_type() && + SHT_NULL != sec->get_type() ) { + current_file_pos += sec->get_size(); + } + } + } + + return true; + } + + //------------------------------------------------------------------------------ + void 
calc_segment_alignment() + { + for ( std::vector::iterator s = segments_.begin(); + s != segments_.end(); ++s ) { + segment* seg = *s; + for ( int i = 0; i < seg->get_sections_num(); ++i ) { + section* sect = sections_[seg->get_section_index_at( i )]; + if ( sect->get_addr_align() > seg->get_align() ) { + seg->set_align( sect->get_addr_align() ); + } + } + } + } + + //------------------------------------------------------------------------------ + bool layout_segments_and_their_sections() + { + std::vector worklist; + std::vector section_generated( sections.size(), false ); + + // Get segments in a order in where segments which contain a + // sub sequence of other segments are located at the end + worklist = get_ordered_segments(); + + for ( unsigned int i = 0; i < worklist.size(); ++i ) { + Elf_Xword segment_memory = 0; + Elf_Xword segment_filesize = 0; + Elf_Xword seg_start_pos = current_file_pos; + segment* seg = worklist[i]; + + // Special case: PHDR segment + // This segment contains the program headers but no sections + if ( seg->get_type() == PT_PHDR && seg->get_sections_num() == 0 ) { + seg_start_pos = header->get_segments_offset(); + segment_memory = segment_filesize = + header->get_segment_entry_size() * + (Elf_Xword)header->get_segments_num(); + } + // Special case: + else if ( seg->is_offset_initialized() && seg->get_offset() == 0 ) { + seg_start_pos = 0; + if ( seg->get_sections_num() ) { + segment_memory = segment_filesize = current_file_pos; + } + } + // New segments with not generated sections + // have to be aligned + else if ( seg->get_sections_num() && + !section_generated[seg->get_section_index_at( 0 )] ) { + Elf_Xword align = seg->get_align() > 0 ? 
seg->get_align() : 1; + Elf64_Off cur_page_alignment = current_file_pos % align; + Elf64_Off req_page_alignment = + seg->get_virtual_address() % align; + Elf64_Off error = req_page_alignment - cur_page_alignment; + + current_file_pos += ( seg->get_align() + error ) % align; + seg_start_pos = current_file_pos; + } + else if ( seg->get_sections_num() ) { + seg_start_pos = + sections[seg->get_section_index_at( 0 )]->get_offset(); + } + + // Write segment's data + for ( unsigned int j = 0; j < seg->get_sections_num(); ++j ) { + Elf_Half index = seg->get_section_index_at( j ); + + section* sec = sections[index]; + + // The NULL section is always generated + if ( SHT_NULL == sec->get_type() ) { + section_generated[index] = true; + continue; + } + + Elf_Xword secAlign = 0; + // Fix up the alignment + if ( !section_generated[index] && + sec->is_address_initialized() && + SHT_NOBITS != sec->get_type() && + SHT_NULL != sec->get_type() && 0 != sec->get_size() ) { + // Align the sections based on the virtual addresses + // when possible (this is what matters for execution) + Elf64_Off req_offset = + sec->get_address() - seg->get_virtual_address(); + Elf64_Off cur_offset = current_file_pos - seg_start_pos; + if ( req_offset < cur_offset ) { + // something has gone awfully wrong, abort! 
+ // secAlign would turn out negative, seeking backwards and overwriting previous data + return false; + } + secAlign = req_offset - cur_offset; + } + else if ( !section_generated[index] && + !sec->is_address_initialized() ) { + // If no address has been specified then only the section + // alignment constraint has to be matched + Elf_Xword align = sec->get_addr_align(); + if ( align == 0 ) { + align = 1; + } + Elf64_Off error = current_file_pos % align; + secAlign = ( align - error ) % align; + } + else if ( section_generated[index] ) { + // Alignment for already generated sections + secAlign = + sec->get_offset() - seg_start_pos - segment_filesize; + } + + // Determine the segment file and memory sizes + // Special case .tbss section (NOBITS) in non TLS segment + if ( ( sec->get_flags() & SHF_ALLOC ) && + !( ( sec->get_flags() & SHF_TLS ) && + ( seg->get_type() != PT_TLS ) && + ( SHT_NOBITS == sec->get_type() ) ) ) + segment_memory += sec->get_size() + secAlign; + + if ( SHT_NOBITS != sec->get_type() ) + segment_filesize += sec->get_size() + secAlign; + + // Nothing to be done when generating nested segments + if ( section_generated[index] ) { + continue; + } + + current_file_pos += secAlign; + + // Set the section addresses when missing + if ( !sec->is_address_initialized() ) + sec->set_address( seg->get_virtual_address() + + current_file_pos - seg_start_pos ); + + if ( 0 != sec->get_index() ) + sec->set_offset( current_file_pos ); + + if ( SHT_NOBITS != sec->get_type() ) + current_file_pos += sec->get_size(); + + section_generated[index] = true; + } + + seg->set_file_size( segment_filesize ); + + // If we already have a memory size from loading an elf file (value > 0), + // it must not shrink! + // Memory size may be bigger than file size and it is the loader's job to do something + // with the surplus bytes in memory, like initializing them with a defined value. 
+ if ( seg->get_memory_size() < segment_memory ) { + seg->set_memory_size( segment_memory ); + } + + seg->set_offset( seg_start_pos ); + } + + return true; + } + + //------------------------------------------------------------------------------ + bool layout_section_table() + { + // Simply place the section table at the end for now + Elf64_Off alignmentError = current_file_pos % 4; + current_file_pos += ( 4 - alignmentError ) % 4; + header->set_sections_offset( current_file_pos ); + return true; + } + + //------------------------------------------------------------------------------ + public: + friend class Sections; + class Sections + { + public: + //------------------------------------------------------------------------------ + Sections( elfio* parent_ ) : parent( parent_ ) {} + + //------------------------------------------------------------------------------ + Elf_Half size() const { return (Elf_Half)parent->sections_.size(); } + + //------------------------------------------------------------------------------ + section* operator[]( unsigned int index ) const + { + section* sec = 0; + + if ( index < parent->sections_.size() ) { + sec = parent->sections_[index]; + } + + return sec; + } + + //------------------------------------------------------------------------------ + section* operator[]( const std::string& name ) const + { + section* sec = 0; + + std::vector::const_iterator it; + for ( it = parent->sections_.begin(); it != parent->sections_.end(); + ++it ) { + if ( ( *it )->get_name() == name ) { + sec = *it; + break; + } + } + + return sec; + } + + //------------------------------------------------------------------------------ + section* add( const std::string& name ) + { + section* new_section = parent->create_section(); + new_section->set_name( name ); + + Elf_Half str_index = parent->get_section_name_str_index(); + section* string_table( parent->sections_[str_index] ); + string_section_accessor str_writer( string_table ); + Elf_Word pos = 
str_writer.add_string( name ); + new_section->set_name_string_offset( pos ); + + return new_section; + } + + //------------------------------------------------------------------------------ + std::vector::iterator begin() + { + return parent->sections_.begin(); + } + + //------------------------------------------------------------------------------ + std::vector::iterator end() + { + return parent->sections_.end(); + } + + //------------------------------------------------------------------------------ + std::vector::const_iterator begin() const + { + return parent->sections_.cbegin(); + } + + //------------------------------------------------------------------------------ + std::vector::const_iterator end() const + { + return parent->sections_.cend(); + } + + //------------------------------------------------------------------------------ + private: + elfio* parent; + } sections; + + //------------------------------------------------------------------------------ + public: + friend class Segments; + class Segments + { + public: + //------------------------------------------------------------------------------ + Segments( elfio* parent_ ) : parent( parent_ ) {} + + //------------------------------------------------------------------------------ + Elf_Half size() const { return (Elf_Half)parent->segments_.size(); } + + //------------------------------------------------------------------------------ + segment* operator[]( unsigned int index ) const + { + return parent->segments_[index]; + } + + //------------------------------------------------------------------------------ + segment* add() { return parent->create_segment(); } + + //------------------------------------------------------------------------------ + std::vector::iterator begin() + { + return parent->segments_.begin(); + } + + //------------------------------------------------------------------------------ + std::vector::iterator end() + { + return parent->segments_.end(); + } + + 
//------------------------------------------------------------------------------ + std::vector::const_iterator begin() const + { + return parent->segments_.cbegin(); + } + + //------------------------------------------------------------------------------ + std::vector::const_iterator end() const + { + return parent->segments_.cend(); + } + + //------------------------------------------------------------------------------ + private: + elfio* parent; + } segments; + + //------------------------------------------------------------------------------ + private: + elf_header* header; + std::vector sections_; + std::vector segments_; + endianess_convertor convertor; + + Elf_Xword current_file_pos; +}; + +} // namespace ELFIO + + +/*** Start of inlined file: elfio_symbols.hpp ***/ +#ifndef ELFIO_SYMBOLS_HPP +#define ELFIO_SYMBOLS_HPP + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class symbol_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + symbol_section_accessor_template( const elfio& elf_file_, + S* symbol_section_ ) + : elf_file( elf_file_ ), symbol_section( symbol_section_ ) + { + find_hash_section(); + } + + //------------------------------------------------------------------------------ + Elf_Xword get_symbols_num() const + { + Elf_Xword nRet = 0; + if ( 0 != symbol_section->get_entry_size() ) { + nRet = + symbol_section->get_size() / symbol_section->get_entry_size(); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + bool get_symbol( Elf_Xword index, + std::string& name, + Elf64_Addr& value, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + bool ret = false; + + if ( elf_file.get_class() == ELFCLASS32 ) { + ret = generic_get_symbol( index, name, value, size, bind, + type, section_index, 
other ); + } + else { + ret = generic_get_symbol( index, name, value, size, bind, + type, section_index, other ); + } + + return ret; + } + + //------------------------------------------------------------------------------ + bool get_symbol( const std::string& name, + Elf64_Addr& value, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + bool ret = false; + + if ( 0 != get_hash_table_index() ) { + Elf_Word nbucket = *(const Elf_Word*)hash_section->get_data(); + Elf_Word nchain = *(const Elf_Word*)( hash_section->get_data() + + sizeof( Elf_Word ) ); + Elf_Word val = elf_hash( (const unsigned char*)name.c_str() ); + Elf_Word y = *(const Elf_Word*)( hash_section->get_data() + + ( 2 + val % nbucket ) * + sizeof( Elf_Word ) ); + std::string str; + get_symbol( y, str, value, size, bind, type, section_index, other ); + while ( str != name && STN_UNDEF != y && y < nchain ) { + y = *(const Elf_Word*)( hash_section->get_data() + + ( 2 + nbucket + y ) * + sizeof( Elf_Word ) ); + get_symbol( y, str, value, size, bind, type, section_index, + other ); + } + if ( str == name ) { + ret = true; + } + } + else { + for ( Elf_Xword i = 0; i < get_symbols_num() && !ret; i++ ) { + std::string symbol_name; + if ( get_symbol( i, symbol_name, value, size, bind, type, + section_index, other ) ) { + if ( symbol_name == name ) { + ret = true; + } + } + } + } + + return ret; + } + + //------------------------------------------------------------------------------ + bool get_symbol( const Elf64_Addr& value, + std::string& name, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + + const endianess_convertor& convertor = elf_file.get_convertor(); + + Elf_Xword idx = 0; + bool match = false; + Elf64_Addr v = 0; + + if ( elf_file.get_class() == ELFCLASS32 ) { + match = generic_search_symbols( + [&]( const Elf32_Sym* sym ) { + return convertor( 
sym->st_value ) == value; + }, + idx ); + } + else { + match = generic_search_symbols( + [&]( const Elf64_Sym* sym ) { + return convertor( sym->st_value ) == value; + }, + idx ); + } + + if ( match ) { + return get_symbol( idx, name, v, size, bind, type, section_index, + other ); + } + + return false; + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( Elf_Word name, + Elf64_Addr value, + Elf_Xword size, + unsigned char info, + unsigned char other, + Elf_Half shndx ) + { + Elf_Word nRet; + + if ( symbol_section->get_size() == 0 ) { + if ( elf_file.get_class() == ELFCLASS32 ) { + nRet = generic_add_symbol( 0, 0, 0, 0, 0, 0 ); + } + else { + nRet = generic_add_symbol( 0, 0, 0, 0, 0, 0 ); + } + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + nRet = generic_add_symbol( name, value, size, info, + other, shndx ); + } + else { + nRet = generic_add_symbol( name, value, size, info, + other, shndx ); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( Elf_Word name, + Elf64_Addr value, + Elf_Xword size, + unsigned char bind, + unsigned char type, + unsigned char other, + Elf_Half shndx ) + { + return add_symbol( name, value, size, ELF_ST_INFO( bind, type ), other, + shndx ); + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( string_section_accessor& pStrWriter, + const char* str, + Elf64_Addr value, + Elf_Xword size, + unsigned char info, + unsigned char other, + Elf_Half shndx ) + { + Elf_Word index = pStrWriter.add_string( str ); + return add_symbol( index, value, size, info, other, shndx ); + } + + //------------------------------------------------------------------------------ + Elf_Word add_symbol( string_section_accessor& pStrWriter, + const char* str, + Elf64_Addr value, + Elf_Xword size, + unsigned char bind, + unsigned char type, + unsigned char other, + Elf_Half 
shndx ) + { + return add_symbol( pStrWriter, str, value, size, + ELF_ST_INFO( bind, type ), other, shndx ); + } + + //------------------------------------------------------------------------------ + Elf_Xword arrange_local_symbols( + std::function func = + nullptr ) + { + int nRet = 0; + + if ( elf_file.get_class() == ELFCLASS32 ) { + nRet = generic_arrange_local_symbols( func ); + } + else { + nRet = generic_arrange_local_symbols( func ); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + void find_hash_section() + { + hash_section = 0; + hash_section_index = 0; + Elf_Half nSecNo = elf_file.sections.size(); + for ( Elf_Half i = 0; i < nSecNo && 0 == hash_section_index; ++i ) { + const section* sec = elf_file.sections[i]; + if ( sec->get_link() == symbol_section->get_index() ) { + hash_section = sec; + hash_section_index = i; + } + } + } + + //------------------------------------------------------------------------------ + Elf_Half get_string_table_index() const + { + return (Elf_Half)symbol_section->get_link(); + } + + //------------------------------------------------------------------------------ + Elf_Half get_hash_table_index() const { return hash_section_index; } + + //------------------------------------------------------------------------------ + template const T* generic_get_symbol_ptr( Elf_Xword index ) const + { + if ( 0 != symbol_section->get_data() && index < get_symbols_num() ) { + const T* pSym = reinterpret_cast( + symbol_section->get_data() + + index * symbol_section->get_entry_size() ); + + return pSym; + } + + return nullptr; + } + + //------------------------------------------------------------------------------ + template + bool generic_search_symbols( std::function match, + Elf_Xword& idx ) const + { + for ( Elf_Xword i = 0; i < get_symbols_num(); i++ ) { + const T* symPtr = 
generic_get_symbol_ptr( i ); + + if ( symPtr == nullptr ) + return false; + + if ( match( symPtr ) ) { + idx = i; + return true; + } + } + + return false; + } + + //------------------------------------------------------------------------------ + template + bool generic_get_symbol( Elf_Xword index, + std::string& name, + Elf64_Addr& value, + Elf_Xword& size, + unsigned char& bind, + unsigned char& type, + Elf_Half& section_index, + unsigned char& other ) const + { + bool ret = false; + + if ( 0 != symbol_section->get_data() && index < get_symbols_num() ) { + const T* pSym = reinterpret_cast( + symbol_section->get_data() + + index * symbol_section->get_entry_size() ); + + const endianess_convertor& convertor = elf_file.get_convertor(); + + section* string_section = + elf_file.sections[get_string_table_index()]; + string_section_accessor str_reader( string_section ); + const char* pStr = + str_reader.get_string( convertor( pSym->st_name ) ); + if ( 0 != pStr ) { + name = pStr; + } + value = convertor( pSym->st_value ); + size = convertor( pSym->st_size ); + bind = ELF_ST_BIND( pSym->st_info ); + type = ELF_ST_TYPE( pSym->st_info ); + section_index = convertor( pSym->st_shndx ); + other = pSym->st_other; + + ret = true; + } + + return ret; + } + + //------------------------------------------------------------------------------ + template + Elf_Word generic_add_symbol( Elf_Word name, + Elf64_Addr value, + Elf_Xword size, + unsigned char info, + unsigned char other, + Elf_Half shndx ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T entry; + entry.st_name = convertor( name ); + entry.st_value = value; + entry.st_value = convertor( entry.st_value ); + entry.st_size = size; + entry.st_size = convertor( entry.st_size ); + entry.st_info = convertor( info ); + entry.st_other = convertor( other ); + entry.st_shndx = convertor( shndx ); + + symbol_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + + Elf_Word nRet = 
symbol_section->get_size() / sizeof( entry ) - 1; + + return nRet; + } + + //------------------------------------------------------------------------------ + template + Elf_Xword generic_arrange_local_symbols( + std::function func ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + const Elf_Xword size = symbol_section->get_entry_size(); + + Elf_Xword first_not_local = + 1; // Skip the first entry. It is always NOTYPE + Elf_Xword current = 0; + Elf_Xword count = get_symbols_num(); + + while ( true ) { + T* p1 = nullptr; + T* p2 = nullptr; + + while ( first_not_local < count ) { + p1 = const_cast( + generic_get_symbol_ptr( first_not_local ) ); + if ( ELF_ST_BIND( convertor( p1->st_info ) ) != STB_LOCAL ) + break; + ++first_not_local; + } + + current = first_not_local + 1; + while ( current < count ) { + p2 = const_cast( generic_get_symbol_ptr( current ) ); + if ( ELF_ST_BIND( convertor( p2->st_info ) ) == STB_LOCAL ) + break; + ++current; + } + + if ( first_not_local < count && current < count ) { + if ( func ) + func( first_not_local, current ); + + // Swap the symbols + T tmp; + std::copy( p1, p1 + 1, &tmp ); + std::copy( p2, p2 + 1, p1 ); + std::copy( &tmp, &tmp + 1, p2 ); + } + else { + // Update 'info' field of the section + symbol_section->set_info( first_not_local ); + break; + } + } + + // Elf_Word nRet = symbol_section->get_size() / sizeof(entry) - 1; + + return first_not_local; + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* symbol_section; + Elf_Half hash_section_index; + const section* hash_section; +}; + +using symbol_section_accessor = symbol_section_accessor_template
; +using const_symbol_section_accessor = + symbol_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_SYMBOLS_HPP + +/*** End of inlined file: elfio_symbols.hpp ***/ + + +/*** Start of inlined file: elfio_note.hpp ***/ +#ifndef ELFIO_NOTE_HPP +#define ELFIO_NOTE_HPP + +namespace ELFIO { + +//------------------------------------------------------------------------------ +// There are discrepancies in documentations. SCO documentation +// (http://www.sco.com/developers/gabi/latest/ch5.pheader.html#note_section) +// requires 8 byte entries alignment for 64-bit ELF file, +// but Oracle's definition uses the same structure +// for 32-bit and 64-bit formats. +// (https://docs.oracle.com/cd/E23824_01/html/819-0690/chapter6-18048.html) +// +// It looks like EM_X86_64 Linux implementation is similar to Oracle's +// definition. Therefore, the same alignment works for both formats +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +template class note_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + note_section_accessor_template( const elfio& elf_file_, S* section_ ) + : elf_file( elf_file_ ), note_section( section_ ) + { + process_section(); + } + + //------------------------------------------------------------------------------ + Elf_Word get_notes_num() const + { + return (Elf_Word)note_start_positions.size(); + } + + //------------------------------------------------------------------------------ + bool get_note( Elf_Word index, + Elf_Word& type, + std::string& name, + void*& desc, + Elf_Word& descSize ) const + { + if ( index >= note_section->get_size() ) { + return false; + } + + const char* pData = + note_section->get_data() + note_start_positions[index]; + int align = sizeof( Elf_Word ); + + const endianess_convertor& convertor = elf_file.get_convertor(); + type 
= convertor( *(const Elf_Word*)( pData + 2 * align ) ); + Elf_Word namesz = convertor( *(const Elf_Word*)( pData ) ); + descSize = convertor( *(const Elf_Word*)( pData + sizeof( namesz ) ) ); + + Elf_Xword max_name_size = + note_section->get_size() - note_start_positions[index]; + if ( namesz < 1 || namesz > max_name_size || + (Elf_Xword)namesz + descSize > max_name_size ) { + return false; + } + name.assign( pData + 3 * align, namesz - 1 ); + if ( 0 == descSize ) { + desc = 0; + } + else { + desc = + const_cast( pData + 3 * align + + ( ( namesz + align - 1 ) / align ) * align ); + } + + return true; + } + + //------------------------------------------------------------------------------ + void add_note( Elf_Word type, + const std::string& name, + const void* desc, + Elf_Word descSize ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + int align = sizeof( Elf_Word ); + Elf_Word nameLen = (Elf_Word)name.size() + 1; + Elf_Word nameLenConv = convertor( nameLen ); + std::string buffer( reinterpret_cast( &nameLenConv ), align ); + Elf_Word descSizeConv = convertor( descSize ); + + buffer.append( reinterpret_cast( &descSizeConv ), align ); + type = convertor( type ); + buffer.append( reinterpret_cast( &type ), align ); + buffer.append( name ); + buffer.append( 1, '\x00' ); + const char pad[] = { '\0', '\0', '\0', '\0' }; + if ( nameLen % align != 0 ) { + buffer.append( pad, align - nameLen % align ); + } + if ( desc != 0 && descSize != 0 ) { + buffer.append( reinterpret_cast( desc ), descSize ); + if ( descSize % align != 0 ) { + buffer.append( pad, align - descSize % align ); + } + } + + note_start_positions.push_back( note_section->get_size() ); + note_section->append_data( buffer ); + } + + private: + //------------------------------------------------------------------------------ + void process_section() + { + const endianess_convertor& convertor = elf_file.get_convertor(); + const char* data = note_section->get_data(); + Elf_Xword size = 
note_section->get_size(); + Elf_Xword current = 0; + + note_start_positions.clear(); + + // Is it empty? + if ( 0 == data || 0 == size ) { + return; + } + + Elf_Word align = sizeof( Elf_Word ); + while ( current + (Elf_Xword)3 * align <= size ) { + note_start_positions.push_back( current ); + Elf_Word namesz = convertor( *(const Elf_Word*)( data + current ) ); + Elf_Word descsz = convertor( + *(const Elf_Word*)( data + current + sizeof( namesz ) ) ); + + current += (Elf_Xword)3 * sizeof( Elf_Word ) + + ( ( namesz + align - 1 ) / align ) * (Elf_Xword)align + + ( ( descsz + align - 1 ) / align ) * (Elf_Xword)align; + } + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* note_section; + std::vector note_start_positions; +}; + +using note_section_accessor = note_section_accessor_template
; +using const_note_section_accessor = + note_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_NOTE_HPP + +/*** End of inlined file: elfio_note.hpp ***/ + + +/*** Start of inlined file: elfio_relocation.hpp ***/ +#ifndef ELFIO_RELOCATION_HPP +#define ELFIO_RELOCATION_HPP + +namespace ELFIO { + +template struct get_sym_and_type; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) + { + return ELF32_R_SYM( (Elf_Word)info ); + } + static int get_r_type( Elf_Xword info ) + { + return ELF32_R_TYPE( (Elf_Word)info ); + } +}; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) + { + return ELF32_R_SYM( (Elf_Word)info ); + } + static int get_r_type( Elf_Xword info ) + { + return ELF32_R_TYPE( (Elf_Word)info ); + } +}; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) { return ELF64_R_SYM( info ); } + static int get_r_type( Elf_Xword info ) { return ELF64_R_TYPE( info ); } +}; +template <> struct get_sym_and_type +{ + static int get_r_sym( Elf_Xword info ) { return ELF64_R_SYM( info ); } + static int get_r_type( Elf_Xword info ) { return ELF64_R_TYPE( info ); } +}; + +//------------------------------------------------------------------------------ +template class relocation_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + relocation_section_accessor_template( const elfio& elf_file_, S* section_ ) + : elf_file( elf_file_ ), relocation_section( section_ ) + { + } + + //------------------------------------------------------------------------------ + Elf_Xword get_entries_num() const + { + Elf_Xword nRet = 0; + + if ( 0 != relocation_section->get_entry_size() ) { + nRet = relocation_section->get_size() / + relocation_section->get_entry_size(); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + bool get_entry( Elf_Xword index, + 
Elf64_Addr& offset, + Elf_Word& symbol, + Elf_Word& type, + Elf_Sxword& addend ) const + { + if ( index >= get_entries_num() ) { // Is index valid + return false; + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + if ( SHT_REL == relocation_section->get_type() ) { + generic_get_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_get_entry_rela( index, offset, symbol, type, + addend ); + } + } + else { + if ( SHT_REL == relocation_section->get_type() ) { + generic_get_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_get_entry_rela( index, offset, symbol, type, + addend ); + } + } + + return true; + } + + //------------------------------------------------------------------------------ + bool get_entry( Elf_Xword index, + Elf64_Addr& offset, + Elf64_Addr& symbolValue, + std::string& symbolName, + Elf_Word& type, + Elf_Sxword& addend, + Elf_Sxword& calcValue ) const + { + // Do regular job + Elf_Word symbol; + bool ret = get_entry( index, offset, symbol, type, addend ); + + // Find the symbol + Elf_Xword size; + unsigned char bind; + unsigned char symbolType; + Elf_Half section; + unsigned char other; + + symbol_section_accessor symbols( + elf_file, elf_file.sections[get_symbol_table_index()] ); + ret = ret && symbols.get_symbol( symbol, symbolName, symbolValue, size, + bind, symbolType, section, other ); + + if ( ret ) { // Was it successful? 
+ switch ( type ) { + case R_386_NONE: // none + calcValue = 0; + break; + case R_386_32: // S + A + calcValue = symbolValue + addend; + break; + case R_386_PC32: // S + A - P + calcValue = symbolValue + addend - offset; + break; + case R_386_GOT32: // G + A - P + calcValue = 0; + break; + case R_386_PLT32: // L + A - P + calcValue = 0; + break; + case R_386_COPY: // none + calcValue = 0; + break; + case R_386_GLOB_DAT: // S + case R_386_JMP_SLOT: // S + calcValue = symbolValue; + break; + case R_386_RELATIVE: // B + A + calcValue = addend; + break; + case R_386_GOTOFF: // S + A - GOT + calcValue = 0; + break; + case R_386_GOTPC: // GOT + A - P + calcValue = 0; + break; + default: // Not recognized symbol! + calcValue = 0; + break; + } + } + + return ret; + } + + //------------------------------------------------------------------------------ + bool set_entry( Elf_Xword index, + Elf64_Addr offset, + Elf_Word symbol, + Elf_Word type, + Elf_Sxword addend ) + { + if ( index >= get_entries_num() ) { // Is index valid + return false; + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + if ( SHT_REL == relocation_section->get_type() ) { + generic_set_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_set_entry_rela( index, offset, symbol, type, + addend ); + } + } + else { + if ( SHT_REL == relocation_section->get_type() ) { + generic_set_entry_rel( index, offset, symbol, type, + addend ); + } + else if ( SHT_RELA == relocation_section->get_type() ) { + generic_set_entry_rela( index, offset, symbol, type, + addend ); + } + } + + return true; + } + + //------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, Elf_Xword info ) + { + if ( elf_file.get_class() == ELFCLASS32 ) { + generic_add_entry( offset, info ); + } + else { + generic_add_entry( offset, info ); + } + } + + 
//------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, Elf_Word symbol, unsigned char type ) + { + Elf_Xword info; + if ( elf_file.get_class() == ELFCLASS32 ) { + info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + + add_entry( offset, info ); + } + + //------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend ) + { + if ( elf_file.get_class() == ELFCLASS32 ) { + generic_add_entry( offset, info, addend ); + } + else { + generic_add_entry( offset, info, addend ); + } + } + + //------------------------------------------------------------------------------ + void add_entry( Elf64_Addr offset, + Elf_Word symbol, + unsigned char type, + Elf_Sxword addend ) + { + Elf_Xword info; + if ( elf_file.get_class() == ELFCLASS32 ) { + info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + + add_entry( offset, info, addend ); + } + + //------------------------------------------------------------------------------ + void add_entry( string_section_accessor str_writer, + const char* str, + symbol_section_accessor sym_writer, + Elf64_Addr value, + Elf_Word size, + unsigned char sym_info, + unsigned char other, + Elf_Half shndx, + Elf64_Addr offset, + unsigned char type ) + { + Elf_Word str_index = str_writer.add_string( str ); + Elf_Word sym_index = sym_writer.add_symbol( str_index, value, size, + sym_info, other, shndx ); + add_entry( offset, sym_index, type ); + } + + //------------------------------------------------------------------------------ + void swap_symbols( Elf_Xword first, Elf_Xword second ) + { + Elf64_Addr offset; + Elf_Word symbol; + Elf_Word rtype; + Elf_Sxword addend; + for ( Elf_Word i = 0; i < get_entries_num(); i++ ) { + get_entry( i, offset, symbol, rtype, addend ); + if ( symbol == 
first ) { + set_entry( i, offset, (Elf_Word)second, rtype, addend ); + } + if ( symbol == second ) { + set_entry( i, offset, (Elf_Word)first, rtype, addend ); + } + } + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + Elf_Half get_symbol_table_index() const + { + return (Elf_Half)relocation_section->get_link(); + } + + //------------------------------------------------------------------------------ + template + void generic_get_entry_rel( Elf_Xword index, + Elf64_Addr& offset, + Elf_Word& symbol, + Elf_Word& type, + Elf_Sxword& addend ) const + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + const T* pEntry = reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ); + offset = convertor( pEntry->r_offset ); + Elf_Xword tmp = convertor( pEntry->r_info ); + symbol = get_sym_and_type::get_r_sym( tmp ); + type = get_sym_and_type::get_r_type( tmp ); + addend = 0; + } + + //------------------------------------------------------------------------------ + template + void generic_get_entry_rela( Elf_Xword index, + Elf64_Addr& offset, + Elf_Word& symbol, + Elf_Word& type, + Elf_Sxword& addend ) const + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + const T* pEntry = reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ); + offset = convertor( pEntry->r_offset ); + Elf_Xword tmp = convertor( pEntry->r_info ); + symbol = get_sym_and_type::get_r_sym( tmp ); + type = get_sym_and_type::get_r_type( tmp ); + addend = convertor( pEntry->r_addend ); + } + + //------------------------------------------------------------------------------ + template + void generic_set_entry_rel( Elf_Xword index, + Elf64_Addr offset, + Elf_Word symbol, + Elf_Word type, + Elf_Sxword ) + { + const endianess_convertor& convertor = 
elf_file.get_convertor(); + + T* pEntry = const_cast( reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ) ); + + if ( elf_file.get_class() == ELFCLASS32 ) { + pEntry->r_info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + pEntry->r_info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + pEntry->r_offset = offset; + pEntry->r_offset = convertor( pEntry->r_offset ); + pEntry->r_info = convertor( pEntry->r_info ); + } + + //------------------------------------------------------------------------------ + template + void generic_set_entry_rela( Elf_Xword index, + Elf64_Addr offset, + Elf_Word symbol, + Elf_Word type, + Elf_Sxword addend ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T* pEntry = const_cast( reinterpret_cast( + relocation_section->get_data() + + index * relocation_section->get_entry_size() ) ); + + if ( elf_file.get_class() == ELFCLASS32 ) { + pEntry->r_info = ELF32_R_INFO( (Elf_Xword)symbol, type ); + } + else { + pEntry->r_info = ELF64_R_INFO( (Elf_Xword)symbol, type ); + } + pEntry->r_offset = offset; + pEntry->r_addend = addend; + pEntry->r_offset = convertor( pEntry->r_offset ); + pEntry->r_info = convertor( pEntry->r_info ); + pEntry->r_addend = convertor( pEntry->r_addend ); + } + + //------------------------------------------------------------------------------ + template + void generic_add_entry( Elf64_Addr offset, Elf_Xword info ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T entry; + entry.r_offset = offset; + entry.r_info = info; + entry.r_offset = convertor( entry.r_offset ); + entry.r_info = convertor( entry.r_info ); + + relocation_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + } + + //------------------------------------------------------------------------------ + template + void + generic_add_entry( Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend ) + { + const endianess_convertor& convertor 
= elf_file.get_convertor(); + + T entry; + entry.r_offset = offset; + entry.r_info = info; + entry.r_addend = addend; + entry.r_offset = convertor( entry.r_offset ); + entry.r_info = convertor( entry.r_info ); + entry.r_addend = convertor( entry.r_addend ); + + relocation_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* relocation_section; +}; + +using relocation_section_accessor = + relocation_section_accessor_template
; +using const_relocation_section_accessor = + relocation_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_RELOCATION_HPP + +/*** End of inlined file: elfio_relocation.hpp ***/ + + +/*** Start of inlined file: elfio_dynamic.hpp ***/ +#ifndef ELFIO_DYNAMIC_HPP +#define ELFIO_DYNAMIC_HPP + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class dynamic_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + dynamic_section_accessor_template( const elfio& elf_file_, S* section_ ) + : elf_file( elf_file_ ), dynamic_section( section_ ) + { + } + + //------------------------------------------------------------------------------ + Elf_Xword get_entries_num() const + { + Elf_Xword nRet = 0; + + if ( 0 != dynamic_section->get_entry_size() ) { + nRet = + dynamic_section->get_size() / dynamic_section->get_entry_size(); + } + + return nRet; + } + + //------------------------------------------------------------------------------ + bool get_entry( Elf_Xword index, + Elf_Xword& tag, + Elf_Xword& value, + std::string& str ) const + { + if ( index >= get_entries_num() ) { // Is index valid + return false; + } + + if ( elf_file.get_class() == ELFCLASS32 ) { + generic_get_entry_dyn( index, tag, value ); + } + else { + generic_get_entry_dyn( index, tag, value ); + } + + // If the tag may have a string table reference, prepare the string + if ( tag == DT_NEEDED || tag == DT_SONAME || tag == DT_RPATH || + tag == DT_RUNPATH ) { + string_section_accessor strsec = + elf_file.sections[get_string_table_index()]; + const char* result = strsec.get_string( value ); + if ( 0 == result ) { + str.clear(); + return false; + } + str = result; + } + else { + str.clear(); + } + + return true; + } + + //------------------------------------------------------------------------------ + void add_entry( Elf_Xword tag, Elf_Xword value ) + { + if ( 
elf_file.get_class() == ELFCLASS32 ) { + generic_add_entry( tag, value ); + } + else { + generic_add_entry( tag, value ); + } + } + + //------------------------------------------------------------------------------ + void add_entry( Elf_Xword tag, const std::string& str ) + { + string_section_accessor strsec = + elf_file.sections[get_string_table_index()]; + Elf_Xword value = strsec.add_string( str ); + add_entry( tag, value ); + } + + //------------------------------------------------------------------------------ + private: + //------------------------------------------------------------------------------ + Elf_Half get_string_table_index() const + { + return (Elf_Half)dynamic_section->get_link(); + } + + //------------------------------------------------------------------------------ + template + void generic_get_entry_dyn( Elf_Xword index, + Elf_Xword& tag, + Elf_Xword& value ) const + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + // Check unusual case when dynamic section has no data + if ( dynamic_section->get_data() == 0 || + ( index + 1 ) * dynamic_section->get_entry_size() > + dynamic_section->get_size() ) { + tag = DT_NULL; + value = 0; + return; + } + + const T* pEntry = reinterpret_cast( + dynamic_section->get_data() + + index * dynamic_section->get_entry_size() ); + tag = convertor( pEntry->d_tag ); + switch ( tag ) { + case DT_NULL: + case DT_SYMBOLIC: + case DT_TEXTREL: + case DT_BIND_NOW: + value = 0; + break; + case DT_NEEDED: + case DT_PLTRELSZ: + case DT_RELASZ: + case DT_RELAENT: + case DT_STRSZ: + case DT_SYMENT: + case DT_SONAME: + case DT_RPATH: + case DT_RELSZ: + case DT_RELENT: + case DT_PLTREL: + case DT_INIT_ARRAYSZ: + case DT_FINI_ARRAYSZ: + case DT_RUNPATH: + case DT_FLAGS: + case DT_PREINIT_ARRAYSZ: + value = convertor( pEntry->d_un.d_val ); + break; + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case 
DT_JMPREL: + case DT_INIT_ARRAY: + case DT_FINI_ARRAY: + case DT_PREINIT_ARRAY: + default: + value = convertor( pEntry->d_un.d_ptr ); + break; + } + } + + //------------------------------------------------------------------------------ + template void generic_add_entry( Elf_Xword tag, Elf_Xword value ) + { + const endianess_convertor& convertor = elf_file.get_convertor(); + + T entry; + + switch ( tag ) { + case DT_NULL: + case DT_SYMBOLIC: + case DT_TEXTREL: + case DT_BIND_NOW: + value = 0; + case DT_NEEDED: + case DT_PLTRELSZ: + case DT_RELASZ: + case DT_RELAENT: + case DT_STRSZ: + case DT_SYMENT: + case DT_SONAME: + case DT_RPATH: + case DT_RELSZ: + case DT_RELENT: + case DT_PLTREL: + case DT_INIT_ARRAYSZ: + case DT_FINI_ARRAYSZ: + case DT_RUNPATH: + case DT_FLAGS: + case DT_PREINIT_ARRAYSZ: + entry.d_un.d_val = convertor( value ); + break; + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_INIT_ARRAY: + case DT_FINI_ARRAY: + case DT_PREINIT_ARRAY: + default: + entry.d_un.d_ptr = convertor( value ); + break; + } + + entry.d_tag = convertor( tag ); + + dynamic_section->append_data( reinterpret_cast( &entry ), + sizeof( entry ) ); + } + + //------------------------------------------------------------------------------ + private: + const elfio& elf_file; + S* dynamic_section; +}; + +using dynamic_section_accessor = dynamic_section_accessor_template
; +using const_dynamic_section_accessor = + dynamic_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_DYNAMIC_HPP + +/*** End of inlined file: elfio_dynamic.hpp ***/ + + +/*** Start of inlined file: elfio_modinfo.hpp ***/ +#ifndef ELFIO_MODINFO_HPP +#define ELFIO_MODINFO_HPP + +#include +#include + +namespace ELFIO { + +//------------------------------------------------------------------------------ +template class modinfo_section_accessor_template +{ + public: + //------------------------------------------------------------------------------ + modinfo_section_accessor_template( S* section_ ) + : modinfo_section( section_ ) + { + process_section(); + } + + //------------------------------------------------------------------------------ + Elf_Word get_attribute_num() const { return (Elf_Word)content.size(); } + + //------------------------------------------------------------------------------ + bool + get_attribute( Elf_Word no, std::string& field, std::string& value ) const + { + if ( no < content.size() ) { + field = content[no].first; + value = content[no].second; + return true; + } + + return false; + } + + //------------------------------------------------------------------------------ + bool get_attribute( std::string field_name, std::string& value ) const + { + for ( auto i = content.begin(); i != content.end(); i++ ) { + if ( field_name == i->first ) { + value = i->second; + return true; + } + } + + return false; + } + + //------------------------------------------------------------------------------ + Elf_Word add_attribute( std::string field, std::string value ) + { + Elf_Word current_position = 0; + + if ( modinfo_section ) { + // Strings are addeded to the end of the current section data + current_position = (Elf_Word)modinfo_section->get_size(); + + std::string attribute = field + "=" + value; + + modinfo_section->append_data( attribute + '\0' ); + content.push_back( + std::pair( field, value ) ); + } + + return current_position; + } 
+ + //------------------------------------------------------------------------------ + private: + void process_section() + { + const char* pdata = modinfo_section->get_data(); + if ( pdata ) { + ELFIO::Elf_Xword i = 0; + while ( i < modinfo_section->get_size() ) { + while ( i < modinfo_section->get_size() && !pdata[i] ) + i++; + if ( i < modinfo_section->get_size() ) { + std::string info = pdata + i; + size_t loc = info.find( '=' ); + std::pair attribute( + info.substr( 0, loc ), info.substr( loc + 1 ) ); + + content.push_back( attribute ); + + i += info.length(); + } + } + } + } + + //------------------------------------------------------------------------------ + private: + S* modinfo_section; + std::vector> content; +}; + +using modinfo_section_accessor = modinfo_section_accessor_template
; +using const_modinfo_section_accessor = + modinfo_section_accessor_template; + +} // namespace ELFIO + +#endif // ELFIO_MODINFO_HPP + +/*** End of inlined file: elfio_modinfo.hpp ***/ + +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + +#endif // ELFIO_HPP + +/*** End of inlined file: elfio.hpp ***/ + + +namespace ELFIO { + +static struct class_table_t +{ + const char key; + const char* str; +} class_table[] = { + { ELFCLASS32, "ELF32" }, + { ELFCLASS64, "ELF64" }, +}; + +static struct endian_table_t +{ + const char key; + const char* str; +} endian_table[] = { + { ELFDATANONE, "None" }, + { ELFDATA2LSB, "Little endian" }, + { ELFDATA2MSB, "Big endian" }, +}; + +static struct version_table_t +{ + const Elf64_Word key; + const char* str; +} version_table[] = { + { EV_NONE, "None" }, + { EV_CURRENT, "Current" }, +}; + +static struct type_table_t +{ + const Elf32_Half key; + const char* str; +} type_table[] = { + { ET_NONE, "No file type" }, { ET_REL, "Relocatable file" }, + { ET_EXEC, "Executable file" }, { ET_DYN, "Shared object file" }, + { ET_CORE, "Core file" }, +}; + +static struct machine_table_t +{ + const Elf64_Half key; + const char* str; +} machine_table[] = { + { EM_NONE, "No machine" }, + { EM_M32, "AT&T WE 32100" }, + { EM_SPARC, "SUN SPARC" }, + { EM_386, "Intel 80386" }, + { EM_68K, "Motorola m68k family" }, + { EM_88K, "Motorola m88k family" }, + { EM_486, "Intel 80486// Reserved for future use" }, + { EM_860, "Intel 80860" }, + { EM_MIPS, "MIPS R3000 (officially, big-endian only)" }, + { EM_S370, "IBM System/370" }, + { EM_MIPS_RS3_LE, + "MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated" }, + { EM_res011, "Reserved" }, + { EM_res012, "Reserved" }, + { EM_res013, "Reserved" }, + { EM_res014, "Reserved" }, + { EM_PARISC, "HPPA" }, + { EM_res016, "Reserved" }, + { EM_VPP550, "Fujitsu VPP500" }, + { EM_SPARC32PLUS, "Sun's v8plus" }, + { EM_960, "Intel 80960" }, + { EM_PPC, "PowerPC" }, + { EM_PPC64, "64-bit PowerPC" }, + { EM_S390, "IBM S/390" 
}, + { EM_SPU, "Sony/Toshiba/IBM SPU" }, + { EM_res024, "Reserved" }, + { EM_res025, "Reserved" }, + { EM_res026, "Reserved" }, + { EM_res027, "Reserved" }, + { EM_res028, "Reserved" }, + { EM_res029, "Reserved" }, + { EM_res030, "Reserved" }, + { EM_res031, "Reserved" }, + { EM_res032, "Reserved" }, + { EM_res033, "Reserved" }, + { EM_res034, "Reserved" }, + { EM_res035, "Reserved" }, + { EM_V800, "NEC V800 series" }, + { EM_FR20, "Fujitsu FR20" }, + { EM_RH32, "TRW RH32" }, + { EM_MCORE, "Motorola M*Core // May also be taken by Fujitsu MMA" }, + { EM_RCE, "Old name for MCore" }, + { EM_ARM, "ARM" }, + { EM_OLD_ALPHA, "Digital Alpha" }, + { EM_SH, "Renesas (formerly Hitachi) / SuperH SH" }, + { EM_SPARCV9, "SPARC v9 64-bit" }, + { EM_TRICORE, "Siemens Tricore embedded processor" }, + { EM_ARC, "ARC Cores" }, + { EM_H8_300, "Renesas (formerly Hitachi) H8/300" }, + { EM_H8_300H, "Renesas (formerly Hitachi) H8/300H" }, + { EM_H8S, "Renesas (formerly Hitachi) H8S" }, + { EM_H8_500, "Renesas (formerly Hitachi) H8/500" }, + { EM_IA_64, "Intel IA-64 Processor" }, + { EM_MIPS_X, "Stanford MIPS-X" }, + { EM_COLDFIRE, "Motorola Coldfire" }, + { EM_68HC12, "Motorola M68HC12" }, + { EM_MMA, "Fujitsu Multimedia Accelerator" }, + { EM_PCP, "Siemens PCP" }, + { EM_NCPU, "Sony nCPU embedded RISC processor" }, + { EM_NDR1, "Denso NDR1 microprocesspr" }, + { EM_STARCORE, "Motorola Star*Core processor" }, + { EM_ME16, "Toyota ME16 processor" }, + { EM_ST100, "STMicroelectronics ST100 processor" }, + { EM_TINYJ, "Advanced Logic Corp. TinyJ embedded processor" }, + { EM_X86_64, "Advanced Micro Devices X86-64 processor" }, + { EM_PDSP, "Sony DSP Processor" }, + { EM_PDP10, "Digital Equipment Corp. PDP-10" }, + { EM_PDP11, "Digital Equipment Corp. 
PDP-11" }, + { EM_FX66, "Siemens FX66 microcontroller" }, + { EM_ST9PLUS, "STMicroelectronics ST9+ 8/16 bit microcontroller" }, + { EM_ST7, "STMicroelectronics ST7 8-bit microcontroller" }, + { EM_68HC16, "Motorola MC68HC16 Microcontroller" }, + { EM_68HC11, "Motorola MC68HC11 Microcontroller" }, + { EM_68HC08, "Motorola MC68HC08 Microcontroller" }, + { EM_68HC05, "Motorola MC68HC05 Microcontroller" }, + { EM_SVX, "Silicon Graphics SVx" }, + { EM_ST19, "STMicroelectronics ST19 8-bit cpu" }, + { EM_VAX, "Digital VAX" }, + { EM_CRIS, "Axis Communications 32-bit embedded processor" }, + { EM_JAVELIN, "Infineon Technologies 32-bit embedded cpu" }, + { EM_FIREPATH, "Element 14 64-bit DSP processor" }, + { EM_ZSP, "LSI Logic's 16-bit DSP processor" }, + { EM_MMIX, "Donald Knuth's educational 64-bit processor" }, + { EM_HUANY, "Harvard's machine-independent format" }, + { EM_PRISM, "SiTera Prism" }, + { EM_AVR, "Atmel AVR 8-bit microcontroller" }, + { EM_FR30, "Fujitsu FR30" }, + { EM_D10V, "Mitsubishi D10V" }, + { EM_D30V, "Mitsubishi D30V" }, + { EM_V850, "NEC v850" }, + { EM_M32R, "Renesas M32R (formerly Mitsubishi M32R)" }, + { EM_MN10300, "Matsushita MN10300" }, + { EM_MN10200, "Matsushita MN10200" }, + { EM_PJ, "picoJava" }, + { EM_OPENRISC, "OpenRISC 32-bit embedded processor" }, + { EM_ARC_A5, "ARC Cores Tangent-A5" }, + { EM_XTENSA, "Tensilica Xtensa Architecture" }, + { EM_VIDEOCORE, "Alphamosaic VideoCore processor" }, + { EM_TMM_GPP, "Thompson Multimedia General Purpose Processor" }, + { EM_NS32K, "National Semiconductor 32000 series" }, + { EM_TPC, "Tenor Network TPC processor" }, + { EM_SNP1K, "Trebia SNP 1000 processor" }, + { EM_ST200, "STMicroelectronics ST200 microcontroller" }, + { EM_IP2K, "Ubicom IP2022 micro controller" }, + { EM_MAX, "MAX Processor" }, + { EM_CR, "National Semiconductor CompactRISC" }, + { EM_F2MC16, "Fujitsu F2MC16" }, + { EM_MSP430, "TI msp430 micro controller" }, + { EM_BLACKFIN, "ADI Blackfin" }, + { EM_SE_C33, "S1C33 Family of 
Seiko Epson processors" }, + { EM_SEP, "Sharp embedded microprocessor" }, + { EM_ARCA, "Arca RISC Microprocessor" }, + { EM_UNICORE, "Microprocessor series from PKU-Unity Ltd. and MPRC of " + "Peking University" }, + { EM_EXCESS, "eXcess: 16/32/64-bit configurable embedded CPU" }, + { EM_DXP, "Icera Semiconductor Inc. Deep Execution Processor" }, + { EM_ALTERA_NIOS2, "Altera Nios II soft-core processor" }, + { EM_CRX, "National Semiconductor CRX" }, + { EM_XGATE, "Motorola XGATE embedded processor" }, + { EM_C166, "Infineon C16x/XC16x processor" }, + { EM_M16C, "Renesas M16C series microprocessors" }, + { EM_DSPIC30F, "Microchip Technology dsPIC30F Digital Signal Controller" }, + { EM_CE, "Freescale Communication Engine RISC core" }, + { EM_M32C, "Renesas M32C series microprocessors" }, + { EM_res121, "Reserved" }, + { EM_res122, "Reserved" }, + { EM_res123, "Reserved" }, + { EM_res124, "Reserved" }, + { EM_res125, "Reserved" }, + { EM_res126, "Reserved" }, + { EM_res127, "Reserved" }, + { EM_res128, "Reserved" }, + { EM_res129, "Reserved" }, + { EM_res130, "Reserved" }, + { EM_TSK3000, "Altium TSK3000 core" }, + { EM_RS08, "Freescale RS08 embedded processor" }, + { EM_res133, "Reserved" }, + { EM_ECOG2, "Cyan Technology eCOG2 microprocessor" }, + { EM_SCORE, "Sunplus Score" }, + { EM_SCORE7, "Sunplus S+core7 RISC processor" }, + { EM_DSP24, "New Japan Radio (NJR) 24-bit DSP Processor" }, + { EM_VIDEOCORE3, "Broadcom VideoCore III processor" }, + { EM_LATTICEMICO32, "RISC processor for Lattice FPGA architecture" }, + { EM_SE_C17, "Seiko Epson C17 family" }, + { EM_TI_C6000, "Texas Instruments TMS320C6000 DSP family" }, + { EM_TI_C2000, "Texas Instruments TMS320C2000 DSP family" }, + { EM_TI_C5500, "Texas Instruments TMS320C55x DSP family" }, + { EM_res143, "Reserved" }, + { EM_res144, "Reserved" }, + { EM_res145, "Reserved" }, + { EM_res146, "Reserved" }, + { EM_res147, "Reserved" }, + { EM_res148, "Reserved" }, + { EM_res149, "Reserved" }, + { EM_res150, 
"Reserved" }, + { EM_res151, "Reserved" }, + { EM_res152, "Reserved" }, + { EM_res153, "Reserved" }, + { EM_res154, "Reserved" }, + { EM_res155, "Reserved" }, + { EM_res156, "Reserved" }, + { EM_res157, "Reserved" }, + { EM_res158, "Reserved" }, + { EM_res159, "Reserved" }, + { EM_MMDSP_PLUS, "STMicroelectronics 64bit VLIW Data Signal Processor" }, + { EM_CYPRESS_M8C, "Cypress M8C microprocessor" }, + { EM_R32C, "Renesas R32C series microprocessors" }, + { EM_TRIMEDIA, "NXP Semiconductors TriMedia architecture family" }, + { EM_QDSP6, "QUALCOMM DSP6 Processor" }, + { EM_8051, "Intel 8051 and variants" }, + { EM_STXP7X, "STMicroelectronics STxP7x family" }, + { EM_NDS32, + "Andes Technology compact code size embedded RISC processor family" }, + { EM_ECOG1, "Cyan Technology eCOG1X family" }, + { EM_ECOG1X, "Cyan Technology eCOG1X family" }, + { EM_MAXQ30, "Dallas Semiconductor MAXQ30 Core Micro-controllers" }, + { EM_XIMO16, "New Japan Radio (NJR) 16-bit DSP Processor" }, + { EM_MANIK, "M2000 Reconfigurable RISC Microprocessor" }, + { EM_CRAYNV2, "Cray Inc. 
NV2 vector architecture" }, + { EM_RX, "Renesas RX family" }, + { EM_METAG, "Imagination Technologies META processor architecture" }, + { EM_MCST_ELBRUS, "MCST Elbrus general purpose hardware architecture" }, + { EM_ECOG16, "Cyan Technology eCOG16 family" }, + { EM_CR16, "National Semiconductor CompactRISC 16-bit processor" }, + { EM_ETPU, "Freescale Extended Time Processing Unit" }, + { EM_SLE9X, "Infineon Technologies SLE9X core" }, + { EM_L1OM, "Intel L1OM" }, + { EM_INTEL181, "Reserved by Intel" }, + { EM_INTEL182, "Reserved by Intel" }, + { EM_res183, "Reserved by ARM" }, + { EM_res184, "Reserved by ARM" }, + { EM_AVR32, "Atmel Corporation 32-bit microprocessor family" }, + { EM_STM8, "STMicroeletronics STM8 8-bit microcontroller" }, + { EM_TILE64, "Tilera TILE64 multicore architecture family" }, + { EM_TILEPRO, "Tilera TILEPro multicore architecture family" }, + { EM_MICROBLAZE, "Xilinx MicroBlaze 32-bit RISC soft processor core" }, + { EM_CUDA, "NVIDIA CUDA architecture " }, +}; + +static struct section_type_table_t +{ + const Elf64_Half key; + const char* str; +} section_type_table[] = { + { SHT_NULL, "NULL" }, + { SHT_PROGBITS, "PROGBITS" }, + { SHT_SYMTAB, "SYMTAB" }, + { SHT_STRTAB, "STRTAB" }, + { SHT_RELA, "RELA" }, + { SHT_HASH, "HASH" }, + { SHT_DYNAMIC, "DYNAMIC" }, + { SHT_NOTE, "NOTE" }, + { SHT_NOBITS, "NOBITS" }, + { SHT_REL, "REL" }, + { SHT_SHLIB, "SHLIB" }, + { SHT_DYNSYM, "DYNSYM" }, + { SHT_INIT_ARRAY, "INIT_ARRAY" }, + { SHT_FINI_ARRAY, "FINI_ARRAY" }, + { SHT_PREINIT_ARRAY, "PREINIT_ARRAY" }, + { SHT_GROUP, "GROUP" }, + { SHT_SYMTAB_SHNDX, "SYMTAB_SHNDX " }, +}; + +static struct segment_type_table_t +{ + const Elf_Word key; + const char* str; +} segment_type_table[] = { + { PT_NULL, "NULL" }, { PT_LOAD, "LOAD" }, { PT_DYNAMIC, "DYNAMIC" }, + { PT_INTERP, "INTERP" }, { PT_NOTE, "NOTE" }, { PT_SHLIB, "SHLIB" }, + { PT_PHDR, "PHDR" }, { PT_TLS, "TLS" }, +}; + +static struct segment_flag_table_t +{ + const Elf_Word key; + const char* str; +} 
segment_flag_table[] = { + { 0, "" }, { 1, "X" }, { 2, "W" }, { 3, "WX" }, + { 4, "R" }, { 5, "RX" }, { 6, "RW" }, { 7, "RWX" }, +}; + +static struct symbol_bind_t +{ + const Elf_Word key; + const char* str; +} symbol_bind_table[] = { + { STB_LOCAL, "LOCAL" }, { STB_GLOBAL, "GLOBAL" }, + { STB_WEAK, "WEAK" }, { STB_LOOS, "LOOS" }, + { STB_HIOS, "HIOS" }, { STB_MULTIDEF, "MULTIDEF" }, + { STB_LOPROC, "LOPROC" }, { STB_HIPROC, "HIPROC" }, +}; + +static struct symbol_type_t +{ + const Elf_Word key; + const char* str; +} symbol_type_table[] = { + { STT_NOTYPE, "NOTYPE" }, { STT_OBJECT, "OBJECT" }, + { STT_FUNC, "FUNC" }, { STT_SECTION, "SECTION" }, + { STT_FILE, "FILE" }, { STT_COMMON, "COMMON" }, + { STT_TLS, "TLS" }, { STT_LOOS, "LOOS" }, + { STT_HIOS, "HIOS" }, { STT_LOPROC, "LOPROC" }, + { STT_HIPROC, "HIPROC" }, +}; + +static struct dynamic_tag_t +{ + const Elf_Word key; + const char* str; +} dynamic_tag_table[] = { + { DT_NULL, "NULL" }, + { DT_NEEDED, "NEEDED" }, + { DT_PLTRELSZ, "PLTRELSZ" }, + { DT_PLTGOT, "PLTGOT" }, + { DT_HASH, "HASH" }, + { DT_STRTAB, "STRTAB" }, + { DT_SYMTAB, "SYMTAB" }, + { DT_RELA, "RELA" }, + { DT_RELASZ, "RELASZ" }, + { DT_RELAENT, "RELAENT" }, + { DT_STRSZ, "STRSZ" }, + { DT_SYMENT, "SYMENT" }, + { DT_INIT, "INIT" }, + { DT_FINI, "FINI" }, + { DT_SONAME, "SONAME" }, + { DT_RPATH, "RPATH" }, + { DT_SYMBOLIC, "SYMBOLIC" }, + { DT_REL, "REL" }, + { DT_RELSZ, "RELSZ" }, + { DT_RELENT, "RELENT" }, + { DT_PLTREL, "PLTREL" }, + { DT_DEBUG, "DEBUG" }, + { DT_TEXTREL, "TEXTREL" }, + { DT_JMPREL, "JMPREL" }, + { DT_BIND_NOW, "BIND_NOW" }, + { DT_INIT_ARRAY, "INIT_ARRAY" }, + { DT_FINI_ARRAY, "FINI_ARRAY" }, + { DT_INIT_ARRAYSZ, "INIT_ARRAYSZ" }, + { DT_FINI_ARRAYSZ, "FINI_ARRAYSZ" }, + { DT_RUNPATH, "RUNPATH" }, + { DT_FLAGS, "FLAGS" }, + { DT_ENCODING, "ENCODING" }, + { DT_PREINIT_ARRAY, "PREINIT_ARRAY" }, + { DT_PREINIT_ARRAYSZ, "PREINIT_ARRAYSZ" }, + { DT_MAXPOSTAGS, "MAXPOSTAGS" }, +}; + +static const ELFIO::Elf_Xword MAX_DATA_ENTRIES = 
64; + +//------------------------------------------------------------------------------ +class dump +{ +#define DUMP_DEC_FORMAT( width ) \ + std::setw( width ) << std::setfill( ' ' ) << std::dec << std::right +#define DUMP_HEX_FORMAT( width ) \ + std::setw( width ) << std::setfill( '0' ) << std::hex << std::right +#define DUMP_STR_FORMAT( width ) \ + std::setw( width ) << std::setfill( ' ' ) << std::hex << std::left + + public: + //------------------------------------------------------------------------------ + static void header( std::ostream& out, const elfio& reader ) + { + if ( !reader.get_header_size() ) { + return; + } + out << "ELF Header" << std::endl + << std::endl + << " Class: " << str_class( reader.get_class() ) << std::endl + << " Encoding: " << str_endian( reader.get_encoding() ) + << std::endl + << " ELFVersion: " << str_version( reader.get_elf_version() ) + << std::endl + << " Type: " << str_type( reader.get_type() ) << std::endl + << " Machine: " << str_machine( reader.get_machine() ) + << std::endl + << " Version: " << str_version( reader.get_version() ) + << std::endl + << " Entry: " + << "0x" << std::hex << reader.get_entry() << std::endl + << " Flags: " + << "0x" << std::hex << reader.get_flags() << std::endl + << std::endl; + } + + //------------------------------------------------------------------------------ + static void section_headers( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.sections.size(); + + if ( n == 0 ) { + return; + } + + out << "Section Headers:" << std::endl; + if ( reader.get_class() == ELFCLASS32 ) { // Output for 32-bit + out << "[ Nr ] Type Addr Size ES Flg Lk Inf " + "Al Name" + << std::endl; + } + else { // Output for 64-bit + out << "[ Nr ] Type Addr Size " + " ES Flg" + << std::endl + << " Lk Inf Al Name" << std::endl; + } + + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + section_header( out, i, sec, reader.get_class() ); + } + + out << "Key to 
Flags: W (write), A (alloc), X (execute)\n\n" + << std::endl; + } + + //------------------------------------------------------------------------------ + static void section_header( std::ostream& out, + Elf_Half no, + const section* sec, + unsigned char elf_class ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + if ( elf_class == ELFCLASS32 ) { // Output for 32-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 17 ) << str_section_type( sec->get_type() ) + << " " << DUMP_HEX_FORMAT( 8 ) << sec->get_address() << " " + << DUMP_HEX_FORMAT( 8 ) << sec->get_size() << " " + << DUMP_HEX_FORMAT( 2 ) << sec->get_entry_size() << " " + << DUMP_STR_FORMAT( 3 ) << section_flags( sec->get_flags() ) + << " " << DUMP_HEX_FORMAT( 2 ) << sec->get_link() << " " + << DUMP_HEX_FORMAT( 3 ) << sec->get_info() << " " + << DUMP_HEX_FORMAT( 2 ) << sec->get_addr_align() << " " + << DUMP_STR_FORMAT( 17 ) << sec->get_name() << " " << std::endl; + } + else { // Output for 64-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 17 ) << str_section_type( sec->get_type() ) + << " " << DUMP_HEX_FORMAT( 16 ) << sec->get_address() << " " + << DUMP_HEX_FORMAT( 16 ) << sec->get_size() << " " + << DUMP_HEX_FORMAT( 4 ) << sec->get_entry_size() << " " + << DUMP_STR_FORMAT( 3 ) << section_flags( sec->get_flags() ) + << " " << std::endl + << " " << DUMP_HEX_FORMAT( 4 ) << sec->get_link() << " " + << DUMP_HEX_FORMAT( 4 ) << sec->get_info() << " " + << DUMP_HEX_FORMAT( 4 ) << sec->get_addr_align() << " " + << DUMP_STR_FORMAT( 17 ) << sec->get_name() << " " << std::endl; + } + + out.flags( original_flags ); + + return; + } + + //------------------------------------------------------------------------------ + static void segment_headers( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.segments.size(); + if ( n == 0 ) { + return; + } + + out << "Segment headers:" << std::endl; + if ( reader.get_class() == ELFCLASS32 ) { // Output 
for 32-bit + out << "[ Nr ] Type VirtAddr PhysAddr FileSize Mem.Size " + "Flags Align" + << std::endl; + } + else { // Output for 64-bit + out << "[ Nr ] Type VirtAddr PhysAddr " + "Flags" + << std::endl + << " FileSize Mem.Size " + "Align" + << std::endl; + } + + for ( Elf_Half i = 0; i < n; ++i ) { + segment* seg = reader.segments[i]; + segment_header( out, i, seg, reader.get_class() ); + } + + out << std::endl; + } + + //------------------------------------------------------------------------------ + static void segment_header( std::ostream& out, + Elf_Half no, + const segment* seg, + unsigned int elf_class ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + if ( elf_class == ELFCLASS32 ) { // Output for 32-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 14 ) << str_segment_type( seg->get_type() ) + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_virtual_address() + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_physical_address() + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_file_size() << " " + << DUMP_HEX_FORMAT( 8 ) << seg->get_memory_size() << " " + << DUMP_STR_FORMAT( 8 ) << str_segment_flag( seg->get_flags() ) + << " " << DUMP_HEX_FORMAT( 8 ) << seg->get_align() << " " + << std::endl; + } + else { // Output for 64-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 14 ) << str_segment_type( seg->get_type() ) + << " " << DUMP_HEX_FORMAT( 16 ) << seg->get_virtual_address() + << " " << DUMP_HEX_FORMAT( 16 ) << seg->get_physical_address() + << " " << DUMP_STR_FORMAT( 16 ) + << str_segment_flag( seg->get_flags() ) << " " << std::endl + << " " << DUMP_HEX_FORMAT( 16 ) + << seg->get_file_size() << " " << DUMP_HEX_FORMAT( 16 ) + << seg->get_memory_size() << " " << DUMP_HEX_FORMAT( 16 ) + << seg->get_align() << " " << std::endl; + } + + out.flags( original_flags ); + } + + //------------------------------------------------------------------------------ + static void symbol_tables( std::ostream& out, const 
elfio& reader ) + { + Elf_Half n = reader.sections.size(); + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( SHT_SYMTAB == sec->get_type() || + SHT_DYNSYM == sec->get_type() ) { + symbol_section_accessor symbols( reader, sec ); + + Elf_Xword sym_no = symbols.get_symbols_num(); + if ( sym_no > 0 ) { + out << "Symbol table (" << sec->get_name() << ")" + << std::endl; + if ( reader.get_class() == + ELFCLASS32 ) { // Output for 32-bit + out << "[ Nr ] Value Size Type Bind " + "Sect Name" + << std::endl; + } + else { // Output for 64-bit + out << "[ Nr ] Value Size Type " + " Bind Sect" + << std::endl + << " Name" << std::endl; + } + for ( Elf_Xword i = 0; i < sym_no; ++i ) { + std::string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + unsigned char bind = 0; + unsigned char type = 0; + Elf_Half section = 0; + unsigned char other = 0; + symbols.get_symbol( i, name, value, size, bind, type, + section, other ); + symbol_table( out, i, name, value, size, bind, type, + section, reader.get_class() ); + } + + out << std::endl; + } + } + } + } + + //------------------------------------------------------------------------------ + static void symbol_table( std::ostream& out, + Elf_Xword no, + std::string& name, + Elf64_Addr value, + Elf_Xword size, + unsigned char bind, + unsigned char type, + Elf_Half section, + unsigned int elf_class ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + if ( elf_class == ELFCLASS32 ) { // Output for 32-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_HEX_FORMAT( 8 ) << value << " " << DUMP_HEX_FORMAT( 8 ) + << size << " " << DUMP_STR_FORMAT( 7 ) + << str_symbol_type( type ) << " " << DUMP_STR_FORMAT( 8 ) + << str_symbol_bind( bind ) << " " << DUMP_DEC_FORMAT( 5 ) + << section << " " << DUMP_STR_FORMAT( 1 ) << name << " " + << std::endl; + } + else { // Output for 64-bit + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_HEX_FORMAT( 16 ) << value 
<< " " + << DUMP_HEX_FORMAT( 16 ) << size << " " << DUMP_STR_FORMAT( 7 ) + << str_symbol_type( type ) << " " << DUMP_STR_FORMAT( 8 ) + << str_symbol_bind( bind ) << " " << DUMP_DEC_FORMAT( 5 ) + << section << " " << std::endl + << " " << DUMP_STR_FORMAT( 1 ) << name << " " + << std::endl; + } + + out.flags( original_flags ); + } + + //------------------------------------------------------------------------------ + static void notes( std::ostream& out, const elfio& reader ) + { + Elf_Half no = reader.sections.size(); + for ( Elf_Half i = 0; i < no; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( SHT_NOTE == sec->get_type() ) { // Look at notes + note_section_accessor notes( reader, sec ); + Elf_Word no_notes = notes.get_notes_num(); + if ( no > 0 ) { + out << "Note section (" << sec->get_name() << ")" + << std::endl + << " No Type Name" << std::endl; + for ( Elf_Word j = 0; j < no_notes; ++j ) { // For all notes + Elf_Word type; + std::string name; + void* desc; + Elf_Word descsz; + + if ( notes.get_note( j, type, name, desc, descsz ) ) { + // 'name' usually contains \0 at the end. 
Try to fix it + name = name.c_str(); + note( out, j, type, name ); + } + } + + out << std::endl; + } + } + } + } + + //------------------------------------------------------------------------------ + static void modinfo( std::ostream& out, const elfio& reader ) + { + Elf_Half no = reader.sections.size(); + for ( Elf_Half i = 0; i < no; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( ".modinfo" == sec->get_name() ) { // Look for the section + out << "Section .modinfo" << std::endl; + + const_modinfo_section_accessor modinfo( sec ); + for ( Elf_Word i = 0; i < modinfo.get_attribute_num(); i++ ) { + std::string field; + std::string value; + if ( modinfo.get_attribute( i, field, value ) ) { + out << " " << std::setw( 20 ) << field + << std::setw( 0 ) << " = " << value << std::endl; + } + } + + out << std::endl; + break; + } + } + } + + //------------------------------------------------------------------------------ + static void + note( std::ostream& out, int no, Elf_Word type, const std::string& name ) + { + out << " [" << DUMP_DEC_FORMAT( 2 ) << no << "] " + << DUMP_HEX_FORMAT( 8 ) << type << " " << DUMP_STR_FORMAT( 1 ) + << name << std::endl; + } + + //------------------------------------------------------------------------------ + static void dynamic_tags( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.sections.size(); + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( SHT_DYNAMIC == sec->get_type() ) { + dynamic_section_accessor dynamic( reader, sec ); + + Elf_Xword dyn_no = dynamic.get_entries_num(); + if ( dyn_no > 0 ) { + out << "Dynamic section (" << sec->get_name() << ")" + << std::endl; + out << "[ Nr ] Tag Name/Value" << std::endl; + for ( Elf_Xword i = 0; i < dyn_no; ++i ) { + Elf_Xword tag = 0; + Elf_Xword value = 0; + std::string str; + dynamic.get_entry( i, tag, value, str ); + dynamic_tag( out, i, tag, value, str, + reader.get_class() ); + if ( DT_NULL == 
tag ) { + break; + } + } + + out << std::endl; + } + } + } + } + + //------------------------------------------------------------------------------ + static void dynamic_tag( std::ostream& out, + Elf_Xword no, + Elf_Xword tag, + Elf_Xword value, + std::string str, + unsigned int /*elf_class*/ ) + { + out << "[" << DUMP_DEC_FORMAT( 5 ) << no << "] " + << DUMP_STR_FORMAT( 16 ) << str_dynamic_tag( tag ) << " "; + if ( str.empty() ) { + out << DUMP_HEX_FORMAT( 16 ) << value << " "; + } + else { + out << DUMP_STR_FORMAT( 32 ) << str << " "; + } + out << std::endl; + } + + //------------------------------------------------------------------------------ + static void section_data( std::ostream& out, const section* sec ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + out << sec->get_name() << std::endl; + const char* pdata = sec->get_data(); + if ( pdata ) { + ELFIO::Elf_Xword i; + for ( i = 0; i < std::min( sec->get_size(), MAX_DATA_ENTRIES ); + ++i ) { + if ( i % 16 == 0 ) { + out << "[" << DUMP_HEX_FORMAT( 8 ) << i << "]"; + } + + out << " " << DUMP_HEX_FORMAT( 2 ) << ( pdata[i] & 0x000000FF ); + + if ( i % 16 == 15 ) { + out << std::endl; + } + } + if ( i % 16 != 0 ) { + out << std::endl; + } + + out.flags( original_flags ); + } + + return; + } + + //------------------------------------------------------------------------------ + static void section_datas( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.sections.size(); + + if ( n == 0 ) { + return; + } + + out << "Section Data:" << std::endl; + + for ( Elf_Half i = 1; i < n; ++i ) { // For all sections + section* sec = reader.sections[i]; + if ( sec->get_type() == SHT_NOBITS ) { + continue; + } + section_data( out, sec ); + } + + out << std::endl; + } + + //------------------------------------------------------------------------------ + static void + segment_data( std::ostream& out, Elf_Half no, const segment* seg ) + { + std::ios_base::fmtflags original_flags = out.flags(); + + out 
<< "Segment # " << no << std::endl; + const char* pdata = seg->get_data(); + if ( pdata ) { + ELFIO::Elf_Xword i; + for ( i = 0; i < std::min( seg->get_file_size(), MAX_DATA_ENTRIES ); + ++i ) { + if ( i % 16 == 0 ) { + out << "[" << DUMP_HEX_FORMAT( 8 ) << i << "]"; + } + + out << " " << DUMP_HEX_FORMAT( 2 ) << ( pdata[i] & 0x000000FF ); + + if ( i % 16 == 15 ) { + out << std::endl; + } + } + if ( i % 16 != 0 ) { + out << std::endl; + } + + out.flags( original_flags ); + } + + return; + } + + //------------------------------------------------------------------------------ + static void segment_datas( std::ostream& out, const elfio& reader ) + { + Elf_Half n = reader.segments.size(); + + if ( n == 0 ) { + return; + } + + out << "Segment Data:" << std::endl; + + for ( Elf_Half i = 0; i < n; ++i ) { // For all sections + segment* seg = reader.segments[i]; + segment_data( out, i, seg ); + } + + out << std::endl; + } + + private: + //------------------------------------------------------------------------------ + template + std::string static find_value_in_table( const T& table, const K& key ) + { + std::string res = "?"; + for ( unsigned int i = 0; i < sizeof( table ) / sizeof( table[0] ); + ++i ) { + if ( table[i].key == key ) { + res = table[i].str; + break; + } + } + + return res; + } + + //------------------------------------------------------------------------------ + template + static std::string format_assoc( const T& table, const K& key ) + { + std::string str = find_value_in_table( table, key ); + if ( str == "?" 
) { + std::ostringstream oss; + oss << str << " (0x" << std::hex << key << ")"; + str = oss.str(); + } + + return str; + } + + //------------------------------------------------------------------------------ + template + static std::string format_assoc( const T& table, const char key ) + { + return format_assoc( table, (const int)key ); + } + + //------------------------------------------------------------------------------ + static std::string section_flags( Elf_Xword flags ) + { + std::string ret = ""; + if ( flags & SHF_WRITE ) { + ret += "W"; + } + if ( flags & SHF_ALLOC ) { + ret += "A"; + } + if ( flags & SHF_EXECINSTR ) { + ret += "X"; + } + + return ret; + } + +//------------------------------------------------------------------------------ +#define STR_FUNC_TABLE( name ) \ + template static std::string str_##name( const T key ) \ + { \ + return format_assoc( name##_table, key ); \ + } + + STR_FUNC_TABLE( class ) + STR_FUNC_TABLE( endian ) + STR_FUNC_TABLE( version ) + STR_FUNC_TABLE( type ) + STR_FUNC_TABLE( machine ) + STR_FUNC_TABLE( section_type ) + STR_FUNC_TABLE( segment_type ) + STR_FUNC_TABLE( segment_flag ) + STR_FUNC_TABLE( symbol_bind ) + STR_FUNC_TABLE( symbol_type ) + STR_FUNC_TABLE( dynamic_tag ) + +#undef STR_FUNC_TABLE +#undef DUMP_DEC_FORMAT +#undef DUMP_HEX_FORMAT +#undef DUMP_STR_FORMAT +}; // class dump + +}; // namespace ELFIO + +#endif // ELFIO_DUMP_HPP + +/*** End of inlined file: elfio_dump.hpp ***/ + diff --git a/3rdparty/testutils/cpp-stub/stub.h b/3rdparty/testutils/cpp-stub/stub.h new file mode 100644 index 00000000..c5f2f53f --- /dev/null +++ b/3rdparty/testutils/cpp-stub/stub.h @@ -0,0 +1,360 @@ +#ifndef __STUB_H__ +#define __STUB_H__ + +#ifdef _WIN32 +//windows +#include +#include +#else +//linux +#include +#include +#include +#endif +//c +#include +#include +//c++ +#include + + +#define ADDR(CLASS_NAME,MEMBER_NAME) (&CLASS_NAME::MEMBER_NAME) + +/********************************************************** + replace function 
+**********************************************************/ +#ifdef _WIN32 +#define CACHEFLUSH(addr, size) FlushInstructionCache(GetCurrentProcess(), addr, size) +#else +#define CACHEFLUSH(addr, size) __builtin___clear_cache(addr, addr + size) +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) + #define CODESIZE 16U + #define CODESIZE_MIN 16U + #define CODESIZE_MAX CODESIZE + // ldr x9, +8 + // br x9 + // addr + #define REPLACE_FAR(t, fn, fn_stub)\ + ((uint32_t*)fn)[0] = 0x58000040 | 9;\ + ((uint32_t*)fn)[1] = 0xd61f0120 | (9 << 5);\ + *(long long *)(fn + 8) = (long long )fn_stub;\ + CACHEFLUSH((char *)fn, CODESIZE); + #define REPLACE_NEAR(t, fn, fn_stub) REPLACE_FAR(t, fn, fn_stub) +#elif defined(__arm__) || defined(_M_ARM) + #define CODESIZE 8U + #define CODESIZE_MIN 8U + #define CODESIZE_MAX CODESIZE + // ldr pc, [pc, #-4] + #define REPLACE_FAR(t, fn, fn_stub)\ + ((uint32_t*)fn)[0] = 0xe51ff004;\ + ((uint32_t*)fn)[1] = (uint32_t)fn_stub;\ + CACHEFLUSH((char *)fn, CODESIZE); + #define REPLACE_NEAR(t, fn, fn_stub) REPLACE_FAR(t, fn, fn_stub) +#elif defined(__mips64) + #define CACHEFLUSH(addr, size) __builtin___clear_cache(addr, addr + size) + #define CODESIZE 80U + #define CODESIZE_MIN 80U + #define CODESIZE_MAX CODESIZE + //mips没有PC指针,所以需要手动入栈出栈 + //120000ce0: 67bdffe0 daddiu sp, sp, -32 //入栈 + //120000ce4: ffbf0018 sd ra, 24(sp) + //120000ce8: ffbe0010 sd s8, 16(sp) + //120000cec: ffbc0008 sd gp, 8(sp) + //120000cf0: 03a0f025 move s8, sp + + //120000d2c: 03c0e825 move sp, s8 //出栈 + //120000d30: dfbf0018 ld ra, 24(sp) + //120000d34: dfbe0010 ld s8, 16(sp) + //120000d38: dfbc0008 ld gp, 8(sp) + //120000d3c: 67bd0020 daddiu sp, sp, 32 + //120000d40: 03e00008 jr ra + + #define REPLACE_FAR(t, fn, fn_stub)\ + ((uint32_t *)fn)[0] = 0x67bdffe0;\ + ((uint32_t *)fn)[1] = 0xffbf0018;\ + ((uint32_t *)fn)[2] = 0xffbe0010;\ + ((uint32_t *)fn)[3] = 0xffbc0008;\ + ((uint32_t *)fn)[4] = 0x03a0f025;\ + *(uint16_t *)(fn + 20) = (long long)fn_stub >> 32;\ + *(fn + 22) = 
0x19;\ + *(fn + 23) = 0x24;\ + ((uint32_t *)fn)[6] = 0x0019cc38;\ + *(uint16_t *)(fn + 28) = (long long)fn_stub >> 16;\ + *(fn + 30) = 0x39;\ + *(fn + 31) = 0x37;\ + ((uint32_t *)fn)[8] = 0x0019cc38;\ + *(uint16_t *)(fn + 36) = (long long)fn_stub;\ + *(fn + 38) = 0x39;\ + *(fn + 39) = 0x37;\ + ((uint32_t *)fn)[10] = 0x0320f809;\ + ((uint32_t *)fn)[11] = 0x00000000;\ + ((uint32_t *)fn)[12] = 0x00000000;\ + ((uint32_t *)fn)[13] = 0x03c0e825;\ + ((uint32_t *)fn)[14] = 0xdfbf0018;\ + ((uint32_t *)fn)[15] = 0xdfbe0010;\ + ((uint32_t *)fn)[16] = 0xdfbc0008;\ + ((uint32_t *)fn)[17] = 0x67bd0020;\ + ((uint32_t *)fn)[18] = 0x03e00008;\ + ((uint32_t *)fn)[19] = 0x00000000;\ + CACHEFLUSH((char *)fn, CODESIZE); + #define REPLACE_NEAR(t, fn, fn_stub) REPLACE_FAR(t, fn, fn_stub) +#elif defined(__thumb__) || defined(_M_THUMB) + #error "Thumb is not supported" +#else //__i386__ _x86_64__ + #define CODESIZE 13U + #define CODESIZE_MIN 5U + #define CODESIZE_MAX CODESIZE + //13 byte(jmp m16:64) + //movabs $0x102030405060708,%r11 + //jmpq *%r11 + static void REPLACE_FAR(void *t, char *fn, char *fn_stub) + { + *fn = 0x49; + *(fn + 1) = 0xbb; + *(long long *)(fn + 2) = (long long)fn_stub; + *(fn + 10) = 0x41; + *(fn + 11) = 0xff; + *(fn + 12) = 0xe3; + CACHEFLUSH((char *)fn, CODESIZE); + } + //5 byte(jmp rel32) + #define REPLACE_NEAR(t, fn, fn_stub)\ + *fn = 0xE9;\ + *(int *)(fn + 1) = (int)(fn_stub - fn - CODESIZE_MIN);\ + CACHEFLUSH((char *)fn, CODESIZE); +#endif + +struct func_stub +{ + char *fn; + unsigned char code_buf[CODESIZE]; + bool far_jmp; +}; + +class Stub +{ +public: + Stub() + { +#ifdef _WIN32 + SYSTEM_INFO sys_info; + GetSystemInfo(&sys_info); + m_pagesize = sys_info.dwPageSize; +#else + m_pagesize = sysconf(_SC_PAGE_SIZE); +#endif + + if (m_pagesize < 0) + { + m_pagesize = 4096; + } + } + ~Stub() + { + clear(); + } + + virtual void clear() + { + std::map::iterator iter; + struct func_stub *pstub; + for(iter=m_result.begin(); iter != m_result.end(); iter++) + { + pstub = 
iter->second; +#ifdef _WIN32 + DWORD lpflOldProtect; + if(0 != VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READWRITE, &lpflOldProtect)) +#else + if (0 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_WRITE | PROT_EXEC)) +#endif + { + + if(pstub->far_jmp) + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MAX); + } + else + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MIN); + } + +#ifdef _WIN32 + VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READ, &lpflOldProtect); +#else + CACHEFLUSH(pstub->fn,CODESIZE); + mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_EXEC); +#endif + } + + iter->second = NULL; + delete pstub; + } + + m_result.clear(); + return; + } + template + bool set(T addr, S addr_stub) + { + char * fn; + char * fn_stub; + fn = addrof(addr); + fn_stub = addrof(addr_stub); + struct func_stub *pstub; + std::map::iterator iter = m_result.find(fn); + + if (iter == m_result.end()) + { + pstub = new func_stub; + //start + pstub->fn = fn; + + if(distanceof(fn, fn_stub)) + { + pstub->far_jmp = true; + std::memcpy(pstub->code_buf, fn, CODESIZE_MAX); + } + else + { + pstub->far_jmp = false; + std::memcpy(pstub->code_buf, fn, CODESIZE_MIN); + } + } + else { + pstub = iter->second; + pstub->far_jmp = distanceof(fn, fn_stub); + } + + + +#ifdef _WIN32 + DWORD lpflOldProtect; + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READWRITE, &lpflOldProtect)) +#else + if (-1 == mprotect(pageof(pstub->fn), static_cast(m_pagesize * 2), PROT_READ | PROT_WRITE | PROT_EXEC)) +#endif + { + throw("stub set memory protect to w+r+x faild"); + return false; + } + + if(pstub->far_jmp) + { + REPLACE_FAR(this, fn, fn_stub); + } + else + { + REPLACE_NEAR(this, fn, fn_stub); + } + + +#ifdef _WIN32 + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READ, &lpflOldProtect)) +#else + if (-1 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_EXEC)) +#endif + { + 
throw("stub set memory protect to r+x failed"); + return false; + } + m_result.insert(std::pair(fn,pstub)); + return true; + } + + template + bool reset(T addr) + { + char * fn; + fn = addrof(addr); + + std::map::iterator iter = m_result.find(fn); + + if (iter == m_result.end()) + { + return true; + } + struct func_stub *pstub; + pstub = iter->second; + +#ifdef _WIN32 + DWORD lpflOldProtect; + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READWRITE, &lpflOldProtect)) +#else + if (-1 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_WRITE | PROT_EXEC)) +#endif + { + throw("stub reset memory protect to w+r+x faild"); + return false; + } + + if(pstub->far_jmp) + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MAX); + } + else + { + std::memcpy(pstub->fn, pstub->code_buf, CODESIZE_MIN); + } + +#ifdef _WIN32 + if(0 == VirtualProtect(pageof(pstub->fn), m_pagesize * 2, PAGE_EXECUTE_READ, &lpflOldProtect)) +#else + CACHEFLUSH(pstub->fn,CODESIZE); + if (-1 == mprotect(pageof(pstub->fn), m_pagesize * 2, PROT_READ | PROT_EXEC)) +#endif + { + throw("stub reset memory protect to r+x failed"); + return false; + } + + m_result.erase(iter); + delete pstub; + + return true; + } +protected: + char *pageof(char* addr) + { +#ifdef _WIN32 + return (char *)((unsigned long long)addr & ~(m_pagesize - 1)); +#else + return (char *)((unsigned long)addr & ~(m_pagesize - 1)); +#endif + } + + template + char* addrof(T addr) + { + union + { + T _s; + char* _d; + }ut; + ut._s = addr; + return ut._d; + } + + bool distanceof(char* addr, char* addr_stub) + { + std::ptrdiff_t diff = addr_stub >= addr ? 
addr_stub - addr : addr - addr_stub; + if((sizeof(addr) > 4) && (((diff >> 31) - 1) > 0)) + { + return true; + } + return false; + } + +protected: +#ifdef _WIN32 + //LLP64 + long long m_pagesize; +#else + //LP64 + long m_pagesize; +#endif + std::map m_result; +}; + +#endif diff --git a/3rdparty/testutils/stub-ext/stub-shadow.cpp b/3rdparty/testutils/stub-ext/stub-shadow.cpp new file mode 100644 index 00000000..bfac8c78 --- /dev/null +++ b/3rdparty/testutils/stub-ext/stub-shadow.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: MIT + +#include "stub-shadow.h" + +namespace stub_ext { + +WrapperMap stub_wrappers; + +Wrapper::Wrapper() +{ + +} + +Wrapper::~Wrapper() +{ + +} + +void freeWrapper(Wrapper *wrapper) +{ + if (!wrapper) + return; + + for (auto iter = stub_wrappers.begin(); iter != stub_wrappers.end();) { + if (iter->second == wrapper) + iter = stub_wrappers.erase(iter); + else + ++iter; + } + + delete wrapper; +} +} diff --git a/3rdparty/testutils/stub-ext/stub-shadow.h b/3rdparty/testutils/stub-ext/stub-shadow.h new file mode 100644 index 00000000..e137f35a --- /dev/null +++ b/3rdparty/testutils/stub-ext/stub-shadow.h @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: MIT + +#ifndef STUBSHADOW_H +#define STUBSHADOW_H + +#include +#include + +namespace stub_ext { + +#define LAMDA_FUNCTION_TYPE decltype(&Lamda::operator()) + +class Wrapper +{ +public: + Wrapper(); + virtual ~Wrapper(); +}; + +typedef std::unordered_map WrapperMap; +extern WrapperMap stub_wrappers; + +template +class LamdaWrapper : public Wrapper +{ +public: + LamdaWrapper(Lamda func): Wrapper(),_func(func){} + ~LamdaWrapper(){} + Lamda _func; +}; + +template +struct VFLocator +{ + +}; + +template +struct VFLocator +{ + typedef Ret (*Func)(Obj*, Args...); +}; + +template +struct VFLocator +{ + typedef Ret (*Func)(Obj*, Args...); +}; + +template +struct LamdaCaller +{ + +}; + +template +struct LamdaCaller +{ + template + static Ret call(LamdaWrapper *wrapper, OrgArgs&&... args) + { + return wrapper->_func(std::forward(args)...); + } +}; + +template +struct LamdaCaller +{ + template + static Ret call(LamdaWrapper *wrapper, OrgArgs&&... args) + { + return wrapper->_func(); + } +}; + +template +struct FuncShadow +{ + +}; + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Args...); + typedef Ret RetType; + + static Ret call(Args ...args) + { + Shadow shadow = &call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, args...); + } +}; + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Args...); + typedef Ret RetType; + + static Ret call(Args ...args) + { + Shadow shadow = &call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, args...); + } +}; + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Obj *,Args...); + typedef Ret RetType; + static Ret call(Obj *obj, Args ...args) + { + Shadow shadow = 
&call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, obj, args...); + } +}; + + +template +struct FuncShadow +{ + typedef Ret (*Shadow)(Obj *,Args...); + typedef Ret RetType; + static Ret call(Obj *obj, Args ...args) + { + Shadow shadow = &call; + long id = (long)shadow; + auto iter = stub_wrappers.find(id); + assert(stub_wrappers.find(id) != stub_wrappers.end()); + LamdaWrapper *wrapper = dynamic_cast *>(iter->second); + return LamdaCaller::call(wrapper, obj, args...); + } +}; + +template +typename FuncShadow::Shadow depictShadow(Wrapper **wrapper, Func func, Lamda lamda) +{ + *wrapper = new LamdaWrapper(lamda); + typename FuncShadow::Shadow shadow = &FuncShadow::call; + long id = (long)shadow; + assert(stub_wrappers.find(id) == stub_wrappers.end()); + stub_wrappers.insert(std::make_pair(id,*wrapper)); + return shadow; +} + +void freeWrapper(Wrapper *wrapper); + +} + +#endif // STUBSHADOW_H diff --git a/3rdparty/testutils/stub-ext/stubext.h b/3rdparty/testutils/stub-ext/stubext.h new file mode 100644 index 00000000..33961a7c --- /dev/null +++ b/3rdparty/testutils/stub-ext/stubext.h @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: MIT + +#ifndef STUBEXT_H +#define STUBEXT_H + +//需修改Stub的私用成员函数和成员变量为保护类型 +#include "stub.h" + +#include "stub-shadow.h" + +#ifdef DEBUG_STUB_INVOKE +// use to make sure the stub function is invoked. 
+# define __DBG_STUB_INVOKE__ printf("stub at %s:%d is invoked.\n", __FILE__, __LINE__); +#else +# define __DBG_STUB_INVOKE__ +#endif + +#define VADDR(CLASS_NAME, MEMBER_NAME) (typename stub_ext::VFLocator::Func)(&CLASS_NAME::MEMBER_NAME) + +namespace stub_ext { + +class StubExt : public Stub +{ +public: + StubExt() + : Stub() { } + + template + bool set_lamda(T addr, Lamda lamda) + { + char *fn = addrof(addr); + if (m_result.find(fn) != m_result.end()) + reset(addr); + + Wrapper *wrapper = nullptr; + auto addr_stub = depictShadow(&wrapper, addr, lamda); + if (set(addr, addr_stub)) { + m_wrappers.insert(std::make_pair(fn, wrapper)); + return true; + } else { + freeWrapper(wrapper); + } + return false; + } + + template + void reset(T addr) + { + Stub::reset(addr); + char *fn = addrof(addr); + auto iter = m_wrappers.find(fn); + if (iter != m_wrappers.end()) { + freeWrapper(iter->second); + m_wrappers.erase(iter); + } + } + + ~StubExt() + { + clear(); + } + + void clear() override + { + Stub::clear(); + for (auto iter = m_wrappers.begin(); iter != m_wrappers.end(); ++iter) { + freeWrapper(iter->second); + } + m_wrappers.clear(); + } + + template + static void *get_ctor_addr(bool start = true) + { + // the start vairable must be true, or the compiler will optimize out. + if (start) goto Start; + Call_Constructor: + // This line of code will not be executed. + // The purpose of the code is to allow the compiler to generate the assembly code that calls the constructor. 
+ T(); + Start: + // The address of the line of code T() obtained by assembly + char *p = (char *)&&Call_Constructor; // https://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html + // CALL rel32 + void *ret = 0; + char pos; + char call = 0xe8; + do { + pos = *p; + if (pos == call) { + ret = p + 5 + (*(int *)(p + 1)); + } + + } while (!ret && (++p)); + + return ret; + } + +protected: + std::map m_wrappers; +}; + +} + +#endif // STUBEXT_H diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ef64ce3..a4d26ddc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,9 +43,16 @@ include(GNUInstallDirs) add_subdirectory(${PROJECT_SOURCE_DIR}/src) -# Unit tests (requires Qt Test, enabled by default) -# option(BUILD_UNIT_TESTS is defined in autotests/CMakeLists.txt) -add_subdirectory(${PROJECT_SOURCE_DIR}/autotests) +# Unit tests (enabled by default, except in Release/MinSizeRel builds) +if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + option(BUILD_UNIT_TESTS "Build unit tests" OFF) +else() + option(BUILD_UNIT_TESTS "Build unit tests" ON) +endif() + +if(BUILD_UNIT_TESTS) + add_subdirectory(${PROJECT_SOURCE_DIR}/autotests) +endif() # Legacy tests (temporarily disabled) # add_subdirectory(${PROJECT_SOURCE_DIR}/tests) diff --git a/LICENSES/LGPL-3.0-or-later.txt b/LICENSES/LGPL-3.0-or-later.txt deleted file mode 100644 index 513d1c01..00000000 --- a/LICENSES/LGPL-3.0-or-later.txt +++ /dev/null @@ -1,304 +0,0 @@ -GNU LESSER GENERAL PUBLIC LICENSE -Version 3, 29 June 2007 - -Copyright (C) 2007 Free Software Foundation, Inc. - -Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. - -This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. - -0. Additional Definitions. 
- -As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. - -"The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. - -An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. - -A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". - -The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. - -The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. - -1. Exception to Section 3 of the GNU GPL. -You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. - -2. Conveying Modified Versions. 
-If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: - - a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. - -3. Object Code Incorporating Material from Library Header Files. -The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license document. - -4. Combined Works. -You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: - - a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license document. 
- - c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. - - e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) - -5. Combined Libraries. 
-You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. - -6. Revised Versions of the GNU Lesser General Public License. -The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. - -If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. - -GNU GENERAL PUBLIC LICENSE -Version 3, 29 June 2007 - -Copyright © 2007 Free Software Foundation, Inc. 
- -Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. - -Preamble - -The GNU General Public License is a free, copyleft license for software and other kinds of works. - -The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. - -When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. - -To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. - -For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. - -Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. 
- -For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. - -Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. - -Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. - -The precise terms and conditions for copying, distribution and modification follow. - -TERMS AND CONDITIONS - -0. Definitions. - -“This License” refers to version 3 of the GNU General Public License. - -“Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. - -“The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. 
- -To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. - -A “covered work” means either the unmodified Program or a work based on the Program. - -To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. - -To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. - -An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. - -1. Source Code. -The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. - -A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
- -The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. - -The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. - -The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. - -The Corresponding Source for a work in source code form is that same work. - -2. Basic Permissions. -All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. - -You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. - -Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. - -3. Protecting Users' Legal Rights From Anti-Circumvention Law. -No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. - -When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. - -4. Conveying Verbatim Copies. 
-You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. - -You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. - -5. Conveying Modified Source Versions. -You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. - - c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
- -A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. - -6. Conveying Non-Source Forms. -You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: - - a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. 
This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. - - d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. - -A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. - -A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. - -“Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. - -If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). - -The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
- -Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. - -7. Additional Terms. -“Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. - -When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
- -Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or authors of the material; or - - e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. - -All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
- -If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. - -Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. - -8. Termination. -You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). - -However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. - -Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. - -Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. - -9. Acceptance Not Required for Having Copies. -You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. - -10. Automatic Licensing of Downstream Recipients. -Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. - -An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. - -You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. - -11. Patents. -A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's “contributor version”. - -A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. - -Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. - -In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. - -If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
“Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. - -If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. - -A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. - -Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. - -12. No Surrender of Others' Freedom. 
-If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. - -13. Use with the GNU Affero General Public License. -Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. - -14. Revised Versions of this License. -The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
- -If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. - -Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. - -15. Disclaimer of Warranty. -THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - -16. Limitation of Liability. -IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -17. Interpretation of Sections 15 and 16. 
-If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. - -END OF TERMS AND CONDITIONS - -How to Apply These Terms to Your New Programs - -If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. - -To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found. - - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - -If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
- -The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an “about box”. - -You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. - -The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>. diff --git a/LICENSES/MIT.txt b/LICENSES/MIT.txt new file mode 100644 index 00000000..2071b23b --- /dev/null +++ b/LICENSES/MIT.txt @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) <year> <copyright holders> + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/autotests/CMakeLists.txt b/autotests/CMakeLists.txt index ece3b9bd..d1d46e9f 100644 --- a/autotests/CMakeLists.txt +++ b/autotests/CMakeLists.txt @@ -3,9 +3,8 @@ cmake_minimum_required(VERSION 3.10) project(autotests) -# Enable/disable building of unit tests (default: ON) -option(BUILD_UNIT_TESTS "Build unit tests" ON) - +# BUILD_UNIT_TESTS is defined in top-level CMakeLists.txt; +# this guard acts as a safety net if autotests is somehow included directly if(NOT BUILD_UNIT_TESTS) return() endif() diff --git a/autotests/dfm-search-tests/CMakeLists.txt b/autotests/dfm-search-tests/CMakeLists.txt index 50d22144..4c60beed 100644 --- a/autotests/dfm-search-tests/CMakeLists.txt +++ b/autotests/dfm-search-tests/CMakeLists.txt @@ -24,10 +24,25 @@ target_link_libraries(dfm-search-test ${QT_TEST_LIB} ) +# Add size_parser source for testing (it's part of dfm-searcher client but we test it here) +target_sources(dfm-search-test PRIVATE + ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-client/size_parser.cpp + ${CMAKE_SOURCE_DIR}/3rdparty/testutils/stub-ext/stub-shadow.cpp +) + target_include_directories(dfm-search-test PRIVATE ${CMAKE_SOURCE_DIR}/src/dfm-search ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-lib + ${CMAKE_SOURCE_DIR}/src/dfm-search/dfm-search-client + ${CMAKE_SOURCE_DIR}/3rdparty/testutils/stub-ext + ${CMAKE_SOURCE_DIR}/3rdparty/testutils/cpp-stub +) + +# Pass source directory for locating rule files at runtime +target_compile_definitions(dfm-search-test + PRIVATE + TEST_SOURCE_DIR="${CMAKE_SOURCE_DIR}" ) # Register the test with CTest diff --git a/autotests/dfm-search-tests/main.cpp b/autotests/dfm-search-tests/main.cpp index 59a2f42e..81430b65 100644 --- a/autotests/dfm-search-tests/main.cpp +++ 
b/autotests/dfm-search-tests/main.cpp @@ -9,6 +9,19 @@ extern QObject *create_tst_DfmSearch(); extern QObject *create_tst_SearchUtils(); extern QObject *create_tst_TimeRangeFilter(); extern QObject *create_tst_TextSearchAPI(); +extern QObject *create_tst_RuleEngine(); +extern QObject *create_tst_TimeExtraction(); +extern QObject *create_tst_FileTypeExtraction(); +extern QObject *create_tst_KeywordExtraction(); +extern QObject *create_tst_ParsedIntent(); +extern QObject *create_tst_ChineseNLP(); +extern QObject *create_tst_SizeRangeFilter(); +extern QObject *create_tst_IsSemanticQuery(); +extern QObject *create_tst_SearchTarget(); +extern QObject *create_tst_SemanticQueryBuilderTarget(); +extern QObject *create_tst_ContentRetriever(); +extern QObject *create_tst_ContentSearchEngine(); +extern QObject *create_tst_FileNameSearchEngine(); int main(int argc, char *argv[]) { @@ -31,5 +44,57 @@ int main(int argc, char *argv[]) result |= QTest::qExec(testObj4, argc, argv); delete testObj4; + QObject *testObj5 = create_tst_RuleEngine(); + result |= QTest::qExec(testObj5, argc, argv); + delete testObj5; + + QObject *testObj6 = create_tst_TimeExtraction(); + result |= QTest::qExec(testObj6, argc, argv); + delete testObj6; + + QObject *testObj7 = create_tst_FileTypeExtraction(); + result |= QTest::qExec(testObj7, argc, argv); + delete testObj7; + + QObject *testObj8 = create_tst_KeywordExtraction(); + result |= QTest::qExec(testObj8, argc, argv); + delete testObj8; + + QObject *testObj9 = create_tst_ParsedIntent(); + result |= QTest::qExec(testObj9, argc, argv); + delete testObj9; + + QObject *testObj10 = create_tst_ChineseNLP(); + result |= QTest::qExec(testObj10, argc, argv); + delete testObj10; + + QObject *testObj11 = create_tst_SizeRangeFilter(); + result |= QTest::qExec(testObj11, argc, argv); + delete testObj11; + + QObject *testObj12 = create_tst_IsSemanticQuery(); + result |= QTest::qExec(testObj12, argc, argv); + delete testObj12; + + QObject *testObj13 = 
create_tst_SearchTarget(); + result |= QTest::qExec(testObj13, argc, argv); + delete testObj13; + + QObject *testObj14 = create_tst_SemanticQueryBuilderTarget(); + result |= QTest::qExec(testObj14, argc, argv); + delete testObj14; + + QObject *testObj15 = create_tst_ContentRetriever(); + result |= QTest::qExec(testObj15, argc, argv); + delete testObj15; + + QObject *testObj16 = create_tst_ContentSearchEngine(); + result |= QTest::qExec(testObj16, argc, argv); + delete testObj16; + + QObject *testObj17 = create_tst_FileNameSearchEngine(); + result |= QTest::qExec(testObj17, argc, argv); + delete testObj17; + return result; } diff --git a/autotests/dfm-search-tests/tst_chinese_nlp.cpp b/autotests/dfm-search-tests/tst_chinese_nlp.cpp new file mode 100644 index 00000000..65714652 --- /dev/null +++ b/autotests/dfm-search-tests/tst_chinese_nlp.cpp @@ -0,0 +1,1797 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include +#include +#include + +#include "semantic/intentparser.h" +#include "semantic/semanticruleengine.h" + +using namespace DFMSEARCH; + +static QString rulesDir() +{ + return QStringLiteral(TEST_SOURCE_DIR) + QStringLiteral("/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN"); +} + +// Helper: compare two QStringList as sets (order-independent) +static bool setEquals(const QStringList &a, const QStringList &b) +{ + return QSet(a.begin(), a.end()) == QSet(b.begin(), b.end()); +} + +class tst_ChineseNLP : public QObject +{ + Q_OBJECT + +private: + SemanticRuleEngine *m_engine = nullptr; + IntentParser *m_parser = nullptr; + +private Q_SLOTS: + void initTestCase(); + void cleanupTestCase(); + void init(); + + // Time preset tests + void timePreset_today(); + void timePreset_today_alt(); + void timePreset_yesterday(); + void timePreset_yesterday_variants(); + void timePreset_dayBeforeYesterday(); + void timePreset_thisWeek_variants(); + void 
timePreset_lastWeek_variants(); + void timePreset_thisMonth_variants(); + void timePreset_lastMonth_variants(); + void timePreset_thisYear_variants(); + void timePreset_lastYear_variants(); + + // Time custom tests + void timeCustom_year(); + void timeCustom_year_twoDigit(); + void timeCustom_month(); + void timeCustom_yearMonth(); + void timeCustom_yearMonth_separators(); + void timeCustom_date(); + void timeCustom_dateSpoken(); + void timeCustom_fullDate(); + void timeCustom_fullDate_separators(); + void timeCustom_yesterday_variants_all(); + void timeCustom_lastYear_extra(); + + // File type tests + void fileType_precise_pdf(); + void fileType_precise_word(); + void fileType_precise_excel(); + void fileType_precise_ppt(); + void fileType_category_image_variants(); + void fileType_category_video_variants(); + void fileType_category_audio_variants(); + void fileType_category_archive(); + void fileType_category_application(); + void fileType_category_designSource(); + void fileType_general_document(); + void fileType_general_spreadsheet(); + void fileType_general_presentation(); + + // Filetype all-synonyms tests (from requirements) + void fileType_document_general_allSynonyms(); + void fileType_spreadsheet_general_allSynonyms(); + void filetype_presentation_general_allSynonyms(); + void fileType_image_allSynonyms(); + void fileType_video_allSynonyms(); + void fileType_audio_allSynonyms(); + void fileType_archive_allSynonyms(); + void fileType_application_allSynonyms(); + void fileType_design_source_allSynonyms(); + + // Combined time+type tests + void combined_fullDateAndType(); + void combined_monthAndType(); + void combined_yearAndType(); + + // Size tests + void size_fuzzy_large(); + void size_fuzzy_large_synonyms(); + void size_fuzzy_small(); + void size_dynamic_min(); + void size_dynamic_max(); + void size_dynamic_between(); + void size_chineseUnits_min(); + void size_chineseUnits_max(); + void size_chineseUnits_range(); + void size_noUnit_bytes(); + void 
size_combined_withTime(); + void size_combined_withType(); + void size_combined_full(); + void size_suffix_min(); + void size_suffix_max(); + void size_suffix_combined(); + void size_suffix_chineseUnits(); + + // Relative time tests + void timeRelative_justNow(); + void timeRelative_justNow_synonyms(); + void timeRelative_recentDays(); + void timeRelative_recentDays_synonyms(); + void timeRelative_pastFewDays(); + void timeRelative_pastFewDays_synonyms(); + void timeRelative_aWhileAgo(); + void timeRelative_aWhileAgo_synonyms(); + void timeRelative_priority_vs_preset(); + + // Dynamic relative time tests + void timeDynamic_recent_days(); + void timeDynamic_recent_hours(); + void timeDynamic_recent_weeks(); + void timeDynamic_recent_months(); + void timeDynamic_combined_noKeyword(); + void timeDynamic_combined_withType(); + void timeDynamic_chineseNumerals(); + + // Action behavior tests + void action_create_birthTime(); + void action_create_synonyms(); + void action_modify_modifyTime(); + void action_modify_synonyms(); + void action_default_unspecified(); + void action_combined_withTime_create(); + void action_combined_withTime_modify(); + + // Keyword tests + void keyword_contains_single(); + void keyword_contains_multi(); + void keyword_named(); + void keyword_contentHas(); + void keyword_contentHas_multi(); + + // Noise + unconsumed text tests + void noise_action_words(); + void noise_polite_words(); + void noise_suffix_words(); + + // Location tests + void location_desktop(); + void location_download(); + void location_documentsDir(); + void location_picturesDir(); + void location_musicDir(); + void location_videosDir(); + void location_trash(); + void location_deleted(); + void location_noLocation(); + void location_desktopAndDownload(); + + // End-to-end combined tests + void combined_timeAndFiletype(); + void combined_timeAndFiletype_multi(); + void combined_timeAndFiletype_all(); + void combined_timeAndKeyword(); + void combined_filetypeAndKeyword(); + void 
combined_timeAndFiletypeAndKeyword(); + void combined_noiseStripping(); + void combined_fullSentence(); + void combined_noTime(); + void combined_onlyKeyword(); + void combined_generalSuppressed(); + void combined_contentHasAndType(); +}; + +void tst_ChineseNLP::initTestCase() +{ + // Initialize QCoreApplication for Qt test framework + if (!QCoreApplication::instance()) { + int argc = 0; + new QCoreApplication(argc, nullptr); + } + + m_engine = new SemanticRuleEngine(this); + + // Load all 7 rule files + const QString dir = rulesDir(); + QVERIFY2(QDir(dir).exists(), qPrintable(QStringLiteral("Rules dir not found: ") + dir)); + + const QStringList files = { "noise_rules.json", "time_rules.json", + "filetype_rules.json", "keyword_rules.json", + "size_rules.json", "action_rules.json", + "location_rules.json" }; + for (const QString &f : files) { + const QString path = dir + QLatin1Char('/') + f; + bool ok = m_engine->loadRuleFile(path); + QVERIFY2(ok, qPrintable(QStringLiteral("Failed to load: ") + path)); + } + + // Verify all groups loaded + QVERIFY(m_engine->hasGroup("time")); + QVERIFY(m_engine->hasGroup("filetype")); + QVERIFY(m_engine->hasGroup("keyword")); + QVERIFY(m_engine->hasGroup("noise")); + QVERIFY(m_engine->hasGroup("size")); + QVERIFY(m_engine->hasGroup("action")); + QVERIFY(m_engine->hasGroup("location")); + + const QStringList groups = m_engine->groupNames(); + QCOMPARE(groups.size(), 7); + + m_parser = new IntentParser(m_engine); + + // Verify default extractors are initialized + QStringList names = m_parser->extractorNames(); + QCOMPARE(names.size(), 6); + QVERIFY(names.contains("time")); + QVERIFY(names.contains("filetype")); + QVERIFY(names.contains("size")); + QVERIFY(names.contains("action")); + QVERIFY(names.contains("location")); + QVERIFY(names.contains("keyword")); +} + +void tst_ChineseNLP::cleanupTestCase() +{ + delete m_parser; + m_parser = nullptr; +} + +void tst_ChineseNLP::init() +{ + // Each test gets a fresh parse — no shared state 
between tests +} + +// ===== Time Preset Tests ===== + +void tst_ChineseNLP::timePreset_today() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); +} + +void tst_ChineseNLP::timePreset_today_alt() +{ + // 今日 and 今日份 + ParsedIntent intent1; + m_parser->parse(QStringLiteral("今日的文档"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent1.timeConstraint.preset, TimePreset::Today); + + ParsedIntent intent2; + m_parser->parse(QStringLiteral("今日份图片"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent2.timeConstraint.preset, TimePreset::Today); +} + +void tst_ChineseNLP::timePreset_yesterday() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); +} + +void tst_ChineseNLP::timePreset_yesterday_variants() +{ + const QStringList inputs = { QStringLiteral("昨日"), QStringLiteral("昨晚"), + QStringLiteral("昨天上午"), QStringLiteral("昨天下午"), + QStringLiteral("昨天晚上") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + } +} + +void tst_ChineseNLP::timePreset_dayBeforeYesterday() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("前天的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::DayBeforeYesterday); +} + +void tst_ChineseNLP::timePreset_thisWeek_variants() +{ + const QStringList inputs = { QStringLiteral("本周"), QStringLiteral("这周"), + QStringLiteral("这个星期"), QStringLiteral("这一个星期") }; + for (const QString &input : 
inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::ThisWeek); + } +} + +void tst_ChineseNLP::timePreset_lastWeek_variants() +{ + const QStringList inputs = { QStringLiteral("上周"), QStringLiteral("上个星期"), + QStringLiteral("上星期"), QStringLiteral("上一个星期") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastWeek); + } +} + +void tst_ChineseNLP::timePreset_thisMonth_variants() +{ + const QStringList inputs = { QStringLiteral("本月"), QStringLiteral("这个月"), + QStringLiteral("当月") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::ThisMonth); + } +} + +void tst_ChineseNLP::timePreset_lastMonth_variants() +{ + const QStringList inputs = { QStringLiteral("上个月"), QStringLiteral("上月") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastMonth); + } +} + +void tst_ChineseNLP::timePreset_thisYear_variants() +{ + const QStringList inputs = { QStringLiteral("今年"), QStringLiteral("本年"), + QStringLiteral("这年") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::ThisYear); + } +} + +void tst_ChineseNLP::timePreset_lastYear_variants() +{ + const QStringList inputs = { QStringLiteral("去年"), QStringLiteral("上一年") }; + for (const QString &input : inputs) { + ParsedIntent intent; + 
m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastYear); + } +} + +// ===== Time Custom Tests ===== + +void tst_ChineseNLP::timeCustom_year() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 1); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 1); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + QCOMPARE(intent.timeConstraint.customEnd.date().day(), 31); +} + +void tst_ChineseNLP::timeCustom_year_twoDigit() +{ + // Two-digit year: 25 -> 2025 + ParsedIntent intent; + m_parser->parse(QStringLiteral("25年的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); +} + +// ===== File Type Tests ===== + +void tst_ChineseNLP::fileType_precise_pdf() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("pdf"), intent); + QVERIFY(intent.fileExtensions.contains("pdf")); + QCOMPARE(intent.fileExtensions.size(), 1); +} + +void tst_ChineseNLP::fileType_precise_word() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("word"), intent); + QVERIFY(setEquals(intent.fileExtensions, QStringList { "doc", "docx" })); +} + +void tst_ChineseNLP::fileType_precise_excel() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("excel"), intent); + QVERIFY(setEquals(intent.fileExtensions, QStringList { "xls", "xlsx" })); +} + +void tst_ChineseNLP::fileType_precise_ppt() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("ppt"), intent); + QVERIFY(setEquals(intent.fileExtensions, QStringList { 
"ppt", "pptx" })); +} + +void tst_ChineseNLP::fileType_category_image_variants() +{ + const QStringList inputs = { QStringLiteral("图片"), QStringLiteral("照片"), + QStringLiteral("截图"), QStringLiteral("壁纸"), + QStringLiteral("海报"), QStringLiteral("相片"), + QStringLiteral("表情包"), QStringLiteral("图") }; + const QStringList expectedExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_category_video_variants() +{ + const QStringList inputs = { QStringLiteral("视频"), QStringLiteral("录像"), + QStringLiteral("电影"), QStringLiteral("动画"), + QStringLiteral("短片"), QStringLiteral("片子") }; + const QStringList expectedExts = { "mp4", "avi", "mkv", "mov", "flv", "wmv", "webm" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_category_audio_variants() +{ + const QStringList inputs = { QStringLiteral("音频"), QStringLiteral("音乐"), + QStringLiteral("录音"), QStringLiteral("歌"), + QStringLiteral("语音") }; + const QStringList expectedExts = { "mp3", "wav", "flac", "aac", "ogg", "m4a" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_category_archive() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("压缩包"), intent); + QVERIFY(intent.fileExtensions.contains("zip")); + QVERIFY(intent.fileExtensions.contains("tar.gz")); + QVERIFY(intent.fileExtensions.contains("rar")); + 
QVERIFY(intent.fileExtensions.contains("7z")); +} + +void tst_ChineseNLP::fileType_category_application() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("安装包"), intent); + QVERIFY(intent.fileExtensions.contains("deb")); + QVERIFY(intent.fileExtensions.contains("sh")); +} + +void tst_ChineseNLP::fileType_category_designSource() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("源文件"), intent); + QVERIFY(intent.fileExtensions.contains("psd")); + QVERIFY(intent.fileExtensions.contains("ai")); +} + +void tst_ChineseNLP::fileType_general_document() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("文档"), intent); + const QStringList expectedExts = { "doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt" }; + QVERIFY(setEquals(intent.fileExtensions, expectedExts)); +} + +void tst_ChineseNLP::fileType_general_spreadsheet() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("表格"), intent); + QVERIFY(intent.fileExtensions.contains("xls")); + QVERIFY(intent.fileExtensions.contains("xlsx")); + QVERIFY(intent.fileExtensions.contains("csv")); +} + +void tst_ChineseNLP::fileType_general_presentation() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("幻灯片"), intent); + QVERIFY(intent.fileExtensions.contains("ppt")); + QVERIFY(intent.fileExtensions.contains("pptx")); +} + +// ===== Keyword Tests ===== + +void tst_ChineseNLP::keyword_contains_single() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("包含会议记录的文档"), intent); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("会议记录")); +} + +void tst_ChineseNLP::keyword_contains_multi() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("包含预算和收入的报告"), intent); + QCOMPARE(intent.keywords.size(), 2); + QVERIFY(intent.keywords.contains("预算")); + QVERIFY(intent.keywords.contains("收入")); +} + +void tst_ChineseNLP::keyword_named() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("名为方案A的文档"), intent); + QCOMPARE(intent.keywords.size(), 
1); + QCOMPARE(intent.keywords.first(), QString("方案A")); +} + +void tst_ChineseNLP::keyword_contentHas() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("内容包含数据分析的报告"), intent); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("数据分析")); +} + +void tst_ChineseNLP::keyword_contentHas_multi() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("内容含有产品规划和市场调研的报告"), intent); + QCOMPARE(intent.keywords.size(), 2); + QVERIFY(intent.keywords.contains("产品规划")); + QVERIFY(intent.keywords.contains("市场调研")); +} + +// ===== Noise + Unconsumed Text Tests ===== + +void tst_ChineseNLP::noise_action_words() +{ + // "搜索" is noise; "上周" is time; "图片" is filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("搜索上周的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastWeek); + // Filetype should be matched + QVERIFY(!intent.fileExtensions.isEmpty()); + // No keywords since all text is consumed + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::noise_polite_words() +{ + // "请帮我找" consumed as noise; "今天" time; "文档" filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("请帮我找今天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(!intent.fileExtensions.isEmpty()); + // All text consumed by noise + time + filetype + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::noise_suffix_words() +{ + // "昨天上午" time; "的照片" noise_suffix + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天上午的照片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QVERIFY(!intent.fileExtensions.isEmpty()); + QVERIFY(intent.keywords.isEmpty()); +} + +// ===== End-to-End Combined Tests ===== + +void tst_ChineseNLP::combined_timeAndFiletype() 
+{
+    // Time preset + a single image category; nothing may be left over as a keyword.
+    ParsedIntent intent;
+    m_parser->parse(QStringLiteral("今天的图片"), intent);
+    QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset);
+    QCOMPARE(intent.timeConstraint.preset, TimePreset::Today);
+    const QStringList imageExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" };
+    QVERIFY(setEquals(intent.fileExtensions, imageExts));
+    // "的" is consumed by noise_suffix "的图片"
+    QVERIFY(intent.keywords.isEmpty());
+}
+
+// Time preset + two file categories joined by "和": the extensions of both
+// categories must be present and no keyword may remain.
+void tst_ChineseNLP::combined_timeAndFiletype_multi()
+{
+    ParsedIntent intent;
+    m_parser->parse(QStringLiteral("今天的图片和视频"), intent);
+    QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset);
+    QCOMPARE(intent.timeConstraint.preset, TimePreset::Today);
+    // Should contain both image and video extensions
+    QVERIFY(intent.fileExtensions.contains("jpg"));
+    QVERIFY(intent.fileExtensions.contains("png"));
+    QVERIFY(intent.fileExtensions.contains("mp4"));
+    QVERIFY(intent.fileExtensions.contains("mkv"));
+    QVERIFY(intent.fileExtensions.contains("avi"));
+    QVERIFY(intent.keywords.isEmpty());
+}
+
+// Time preset + three file categories (image, video, audio); one representative
+// extension per category is checked.
+void tst_ChineseNLP::combined_timeAndFiletype_all()
+{
+    ParsedIntent intent;
+    m_parser->parse(QStringLiteral("今天的图片和视频和音频"), intent);
+    QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset);
+    QCOMPARE(intent.timeConstraint.preset, TimePreset::Today);
+    QVERIFY(intent.fileExtensions.contains("jpg"));
+    QVERIFY(intent.fileExtensions.contains("mp4"));
+    QVERIFY(intent.fileExtensions.contains("mp3"));
+    QVERIFY(intent.keywords.isEmpty());
+}
+
+void tst_ChineseNLP::combined_timeAndKeyword()
+{
+    // "今天" time, "包含会议记录的" keyword pattern, "文档" filetype.
+    // Pinned below: the structured keyword match does not suppress the
+    // filetype match for "文档" — both the keyword and a non-empty extension
+    // list are expected.
+    // NOTE(review): the earlier note attributed this to the filetype extractor
+    // running before the keyword extractor; confirm the order in IntentParser.
+ ParsedIntent intent; + m_parser->parse(QStringLiteral("今天包含会议记录的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(intent.keywords.contains("会议记录")); + // "文档" matches filetype_document_general + QVERIFY(!intent.fileExtensions.isEmpty()); +} + +void tst_ChineseNLP::combined_filetypeAndKeyword() +{ + // "名为方案A的" → keyword "方案A"; "pdf" → filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("名为方案A的pdf"), intent); + QVERIFY(intent.fileExtensions.contains("pdf")); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("方案A")); +} + +void tst_ChineseNLP::combined_timeAndFiletypeAndKeyword() +{ + // "昨天" time, "视频" filetype (priority 150, non-general), + // "包含报告的" keyword → "报告" + // Note: "报告" also matches filetype_document_general but it's general + // and gets skipped since video exts are already in seenExtensions + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天的视频和包含报告的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + // Video extensions + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.fileExtensions.contains("avi")); + // Keyword extracted from structured pattern + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("报告")); +} + +void tst_ChineseNLP::combined_noiseStripping() +{ + // "帮我找" noise_action, "今天" time, "会议" unconsumed → keyword, "文档" filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("帮我找今天的会议文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(!intent.fileExtensions.isEmpty()); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("会议")); +} + +void tst_ChineseNLP::combined_fullSentence() +{ + // "请搜索上周的图片和视频" → 
noise(请,搜索) + time(上周) + filetype(图片,视频) + ParsedIntent intent; + m_parser->parse(QStringLiteral("请搜索上周的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastWeek); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_noTime() +{ + // No time, keyword from "包含数据", filetype from "表格" + ParsedIntent intent; + m_parser->parse(QStringLiteral("包含数据的表格"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::None); + QVERIFY(!intent.fileExtensions.isEmpty()); + QVERIFY(intent.fileExtensions.contains("xls")); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("数据")); +} + +void tst_ChineseNLP::combined_onlyKeyword() +{ + // No time, no filetype, only unconsumed text as keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("会议记录"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::None); + QVERIFY(intent.fileExtensions.isEmpty()); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("会议记录")); +} + +void tst_ChineseNLP::combined_generalSuppressed() +{ + // "pdf" precise (priority 200) wins; "文档" general (priority 100) suppressed + ParsedIntent intent; + m_parser->parse(QStringLiteral("pdf文档"), intent); + QCOMPARE(intent.fileExtensions.size(), 1); + QCOMPARE(intent.fileExtensions.first(), QString("pdf")); +} + +void tst_ChineseNLP::combined_contentHasAndType() +{ + // "内容包含测试的报告" → keyword "测试", filetype "报告" (document general) + ParsedIntent intent; + m_parser->parse(QStringLiteral("内容包含测试的报告"), intent); + QVERIFY(intent.keywords.contains("测试")); + QVERIFY(!intent.fileExtensions.isEmpty()); + // "报告" is in filetype_document_general pattern + QVERIFY(intent.fileExtensions.contains("doc")); + QVERIFY(intent.fileExtensions.contains("pdf")); +} + +// ===== New Time 
Custom Tests ===== + +void tst_ChineseNLP::timeCustom_month() +{ + // "12月" → this month 12 + ParsedIntent intent; + m_parser->parse(QStringLiteral("12月的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 1); + // End should be last day of December + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + + // "5月份" — same month, different syntax + ParsedIntent intent2; + m_parser->parse(QStringLiteral("5月份的图片"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent2.timeConstraint.customStart.date().month(), 5); +} + +void tst_ChineseNLP::timeCustom_yearMonth() +{ + // "2025年12月" → year=2025, month=12 + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年12月的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 1); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); +} + +void tst_ChineseNLP::timeCustom_yearMonth_separators() +{ + // "2025-12" — dash separator + ParsedIntent intent1; + m_parser->parse(QStringLiteral("2025-12的图片"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent1.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent1.timeConstraint.customStart.date().month(), 12); + + // "2025/12" — slash separator + ParsedIntent intent2; + m_parser->parse(QStringLiteral("2025/12的视频"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent2.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent2.timeConstraint.customStart.date().month(), 12); + + 
// "25.12" — dot separator, 2-digit year + ParsedIntent intent3; + m_parser->parse(QStringLiteral("25.12的文件"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent3.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent3.timeConstraint.customStart.date().month(), 12); +} + +void tst_ChineseNLP::timeCustom_date() +{ + // "12月5日" → this year, Dec 5 + ParsedIntent intent; + m_parser->parse(QStringLiteral("12月5日的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 5); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + QCOMPARE(intent.timeConstraint.customEnd.date().day(), 5); + QCOMPARE(intent.timeConstraint.customStart.date().year(), QDate::currentDate().year()); +} + +void tst_ChineseNLP::timeCustom_dateSpoken() +{ + // "3月8号" — spoken form with 号 + ParsedIntent intent; + m_parser->parse(QStringLiteral("3月8号的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 3); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 8); +} + +void tst_ChineseNLP::timeCustom_fullDate() +{ + // "2025年12月30日" — the specific example from requirements + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年12月30日的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 30); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().month(), 12); + QCOMPARE(intent.timeConstraint.customEnd.date().day(), 30); + // Verify time boundaries + QCOMPARE(intent.timeConstraint.customStart.time().hour(), 0); 
+ QCOMPARE(intent.timeConstraint.customEnd.time().hour(), 23); +} + +void tst_ChineseNLP::timeCustom_fullDate_separators() +{ + // "2025-12-05" — dash format + ParsedIntent intent1; + m_parser->parse(QStringLiteral("2025-12-05的文档"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent1.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent1.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent1.timeConstraint.customStart.date().day(), 5); + + // "2025/12/5" — slash format (no leading zero) + ParsedIntent intent2; + m_parser->parse(QStringLiteral("2025/12/5的文件"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent2.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent2.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent2.timeConstraint.customStart.date().day(), 5); + + // "2025.12.5" — dot format + ParsedIntent intent3; + m_parser->parse(QStringLiteral("2025.12.5的图片"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent3.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent3.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent3.timeConstraint.customStart.date().day(), 5); +} + +void tst_ChineseNLP::timeCustom_yesterday_variants_all() +{ + // "昨天下午" and "昨天晚上" — these are multi-char variants + ParsedIntent intent1; + m_parser->parse(QStringLiteral("昨天下午的图片"), intent1); + QCOMPARE(intent1.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent1.timeConstraint.preset, TimePreset::Yesterday); + + ParsedIntent intent2; + m_parser->parse(QStringLiteral("昨天晚上的视频"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent2.timeConstraint.preset, TimePreset::Yesterday); +} + +void tst_ChineseNLP::timeCustom_lastYear_extra() +{ + // "去年一整年" — not in current rules, but in requirements + // Current rules only have 
"去年|上一年". Test that "去年" works. + ParsedIntent intent; + m_parser->parse(QStringLiteral("去年的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::LastYear); +} + +// ===== Filetype All-Synonyms Tests (from requirements) ===== + +void tst_ChineseNLP::fileType_document_general_allSynonyms() +{ + // Requirements 2.3.2.2.2: 文档, 报告, 文章, 方案, 文本, 资料, 笔记, 稿件 + const QStringList inputs = { + QStringLiteral("文档"), QStringLiteral("报告"), + QStringLiteral("文章"), QStringLiteral("方案"), QStringLiteral("文本"), + QStringLiteral("资料"), QStringLiteral("笔记"), QStringLiteral("稿件") + }; + const QStringList expectedExts = { "doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input + + QStringLiteral(" got: ") + intent.fileExtensions.join(","))); + } +} + +void tst_ChineseNLP::fileType_spreadsheet_general_allSynonyms() +{ + // Requirements: 表格, 统计表, 报表, 名单, 数据表, 数据, 明细 + // NOTE: "数据" is excluded from rules due to high false-positive risk + const QStringList inputs = { + QStringLiteral("表格"), QStringLiteral("统计表"), QStringLiteral("报表"), + QStringLiteral("名单"), QStringLiteral("数据表"), QStringLiteral("明细") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("xls"), + qPrintable(QStringLiteral("Missing xls for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("xlsx"), + qPrintable(QStringLiteral("Missing xlsx for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("csv"), + qPrintable(QStringLiteral("Missing csv for: ") + input)); + } +} + +void tst_ChineseNLP::filetype_presentation_general_allSynonyms() +{ + // Requirements: 幻灯片, 演示文稿, 汇报, 课件, 宣讲 + const QStringList inputs = { + QStringLiteral("幻灯片"), 
QStringLiteral("演示文稿"), QStringLiteral("汇报"), + QStringLiteral("课件"), QStringLiteral("宣讲") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("ppt"), + qPrintable(QStringLiteral("Missing ppt for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("pptx"), + qPrintable(QStringLiteral("Missing pptx for: ") + input)); + } +} + +void tst_ChineseNLP::fileType_image_allSynonyms() +{ + // Requirements: 图片, 照片, 截图, 图, 壁纸, 海报, 相片, 表情包 + const QStringList inputs = { + QStringLiteral("图片"), QStringLiteral("照片"), QStringLiteral("截图"), + QStringLiteral("图"), QStringLiteral("壁纸"), QStringLiteral("海报"), + QStringLiteral("相片"), QStringLiteral("表情包") + }; + const QStringList expectedExts = { "jpg", "jpeg", "png", "gif", "bmp", "webp", "svg" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_video_allSynonyms() +{ + // Requirements: 视频, 录像, 电影, 动画, 短片, 片子 + const QStringList inputs = { + QStringLiteral("视频"), QStringLiteral("录像"), QStringLiteral("电影"), + QStringLiteral("动画"), QStringLiteral("短片"), QStringLiteral("片子") + }; + const QStringList expectedExts = { "mp4", "avi", "mkv", "mov", "flv", "wmv", "webm" }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_audio_allSynonyms() +{ + // Requirements: 音频, 音乐, 录音, 歌, 语音 + const QStringList inputs = { + QStringLiteral("音频"), QStringLiteral("音乐"), QStringLiteral("录音"), + QStringLiteral("歌"), QStringLiteral("语音") + }; + const QStringList expectedExts = { "mp3", "wav", "flac", "aac", "ogg", "m4a" }; + for (const QString &input : 
inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(setEquals(intent.fileExtensions, expectedExts), + qPrintable(QStringLiteral("Failed for input: ") + input)); + } +} + +void tst_ChineseNLP::fileType_archive_allSynonyms() +{ + // Requirements: 压缩包, 归档, 源码包, 打包文件, zip, rar + const QStringList inputs = { + QStringLiteral("压缩包"), QStringLiteral("归档"), QStringLiteral("源码包"), + QStringLiteral("打包文件"), QStringLiteral("zip"), QStringLiteral("rar") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("zip"), + qPrintable(QStringLiteral("Missing zip for: ") + input)); + } +} + +void tst_ChineseNLP::fileType_application_allSynonyms() +{ + // Requirements: 安装包, 软件, 应用, 脚本, 程序, 包 + // NOTE: "包" excluded from rules to avoid false positives with "表情包", "压缩包" + const QStringList inputs = { + QStringLiteral("安装包"), QStringLiteral("软件"), QStringLiteral("应用"), + QStringLiteral("脚本"), QStringLiteral("程序") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("deb"), + qPrintable(QStringLiteral("Missing deb for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("sh"), + qPrintable(QStringLiteral("Missing sh for: ") + input)); + } +} + +void tst_ChineseNLP::fileType_design_source_allSynonyms() +{ + // Requirements: 源文件, 设计稿, psd, 矢量图, 工程文件 + const QStringList inputs = { + QStringLiteral("源文件"), QStringLiteral("设计稿"), QStringLiteral("矢量图"), + QStringLiteral("工程文件"), QStringLiteral("psd"), QStringLiteral("fig"), + QStringLiteral("sketch") + }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QVERIFY2(intent.fileExtensions.contains("psd"), + qPrintable(QStringLiteral("Missing psd for: ") + input)); + QVERIFY2(intent.fileExtensions.contains("ai"), + qPrintable(QStringLiteral("Missing ai for: ") + input)); + } +} + +// ===== 
Combined Time+Type Tests ===== + +void tst_ChineseNLP::combined_fullDateAndType() +{ + // Requirements example: "2025年12月30日的文档" + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年12月30日的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QCOMPARE(intent.timeConstraint.customStart.date().day(), 30); + // "文档" matches filetype_document_general + QVERIFY(intent.fileExtensions.contains("doc")); + QVERIFY(intent.fileExtensions.contains("pdf")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_monthAndType() +{ + // "12月的图片" — month + image + ParsedIntent intent; + m_parser->parse(QStringLiteral("12月的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().month(), 12); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("png")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::combined_yearAndType() +{ + // "2025年的视频" — year + video + ParsedIntent intent; + m_parser->parse(QStringLiteral("2025年的视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Custom); + QCOMPARE(intent.timeConstraint.customStart.date().year(), 2025); + QCOMPARE(intent.timeConstraint.customEnd.date().year(), 2025); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY(intent.fileExtensions.contains("avi")); +} + +// ===== Size Tests ===== + +void tst_ChineseNLP::size_fuzzy_large() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB + QCOMPARE(intent.sizeConstraint.maxSize, 0LL); // no upper bound +} + +void tst_ChineseNLP::size_fuzzy_large_synonyms() +{ + const QStringList inputs = { QStringLiteral("很大的"), 
QStringLiteral("占空间的"), + QStringLiteral("几个G的") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的图片"), intent); + QVERIFY2(intent.sizeConstraint.isValid(), + qPrintable(QStringLiteral("Size not valid for: ") + input)); + } +} + +void tst_ChineseNLP::size_fuzzy_small() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 0LL); + QCOMPARE(intent.sizeConstraint.maxSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.includeUpper, false); +} + +void tst_ChineseNLP::size_dynamic_min() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大于500M的文档"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); // 500MB + QVERIFY(intent.sizeConstraint.includeLower); +} + +void tst_ChineseNLP::size_dynamic_max() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小于100K的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.maxSize, 102400LL); // 100KB + QCOMPARE(intent.sizeConstraint.minSize, 0LL); +} + +void tst_ChineseNLP::size_dynamic_between() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("1M-10M的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB +} + +void tst_ChineseNLP::size_chineseUnits_min() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大于100兆的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + QVERIFY(intent.sizeConstraint.includeLower); +} + +void tst_ChineseNLP::size_chineseUnits_max() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小于50兆的图片"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.maxSize, 52428800LL); 
// 50MB +} + +void tst_ChineseNLP::size_chineseUnits_range() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("找下大小在1兆到10兆的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QVERIFY(intent.keywords.isEmpty()); + QCOMPARE(intent.sizeConstraint.minSize, 1048576LL); // 1MB + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB +} + +void tst_ChineseNLP::size_noUnit_bytes() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("小于1024的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.maxSize, 1024LL); // raw bytes +} + +void tst_ChineseNLP::size_combined_withTime() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的大文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 524288000LL); +} + +void tst_ChineseNLP::size_combined_withType() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("大文件 pdf"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QVERIFY(intent.fileExtensions.contains("pdf")); +} + +void tst_ChineseNLP::size_combined_full() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天大于100M的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); +} + +void tst_ChineseNLP::size_suffix_min() +{ + // Suffix-only min: "10M以上" without prefix keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("10M以上的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 10485760LL); // 10MB + QCOMPARE(intent.sizeConstraint.maxSize, 0LL); + + // 
"1G以上" — GB unit + ParsedIntent intent2; + m_parser->parse(QStringLiteral("1G以上的图片"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.minSize, 1073741824LL); // 1GB + + // "500K以上" — KB unit + ParsedIntent intent3; + m_parser->parse(QStringLiteral("500K以上的文档"), intent3); + QVERIFY(intent3.sizeConstraint.isValid()); + QCOMPARE(intent3.sizeConstraint.minSize, 512000LL); // 500KB +} + +void tst_ChineseNLP::size_suffix_max() +{ + // Suffix-only max: "10M以内" + ParsedIntent intent; + m_parser->parse(QStringLiteral("10M以内的文档"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 0LL); + QCOMPARE(intent.sizeConstraint.maxSize, 10485760LL); // 10MB + + // "1G以下" — "以下" variant + ParsedIntent intent2; + m_parser->parse(QStringLiteral("1G以下的视频"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.maxSize, 1073741824LL); // 1GB +} + +void tst_ChineseNLP::size_suffix_combined() +{ + // The originally reported bug: "10M以上的表格" + // Should parse both size constraint and filetype + ParsedIntent intent; + m_parser->parse(QStringLiteral("10M以上的表格"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + QCOMPARE(intent.sizeConstraint.minSize, 10485760LL); // 10MB + QVERIFY(intent.fileExtensions.contains("xls")); + QVERIFY(intent.fileExtensions.contains("xlsx")); + QVERIFY(intent.fileExtensions.contains("csv")); + + // "5G以内的压缩包" — size + filetype + ParsedIntent intent2; + m_parser->parse(QStringLiteral("5G以内的压缩包"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.maxSize, 5368709120LL); // 5GB + QVERIFY(intent2.fileExtensions.contains("zip")); + QVERIFY(intent2.fileExtensions.contains("rar")); +} + +void tst_ChineseNLP::size_suffix_chineseUnits() +{ + // Chinese unit names with suffix: "100兆以上" + ParsedIntent intent; + m_parser->parse(QStringLiteral("100兆以上的文件"), intent); + QVERIFY(intent.sizeConstraint.isValid()); + 
QCOMPARE(intent.sizeConstraint.minSize, 104857600LL); // 100MB + + // "50千以内" + ParsedIntent intent2; + m_parser->parse(QStringLiteral("50千以内的图片"), intent2); + QVERIFY(intent2.sizeConstraint.isValid()); + QCOMPARE(intent2.sizeConstraint.maxSize, 51200LL); // 50KB +} + +// ===== Relative Time Tests ===== + +void tst_ChineseNLP::timeRelative_justNow() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("刚刚的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End should be very close to NOW (within 2 seconds) + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime())); + QVERIFY2(endDelta < 2, "Relative end should be close to NOW"); + // Start should be ~2 hours ago + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime().addSecs(-7200))); + QVERIFY2(startDelta < 2, "Relative start should be ~2h ago"); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + +void tst_ChineseNLP::timeRelative_justNow_synonyms() +{ + const QStringList inputs = { QStringLiteral("刚才"), QStringLiteral("刚"), + QStringLiteral("这会儿") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_recentDays() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End = NOW, Start = NOW - 3 days + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime())); + QVERIFY2(endDelta < 2, "Recent days end should be NOW"); + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime().addSecs(-259200))); + QVERIFY2(startDelta < 2, "Recent days start should be ~3 days ago"); +} + +void 
tst_ChineseNLP::timeRelative_recentDays_synonyms() +{ + const QStringList inputs = { QStringLiteral("这几天"), QStringLiteral("近期"), + QStringLiteral("这阵子") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_pastFewDays() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("前几天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End = NOW - 3 days, Start = NOW - 7 days + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime().addSecs(-259200))); + QVERIFY2(endDelta < 2, "Past few days end should be ~3 days ago"); + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime().addSecs(-604800))); + QVERIFY2(startDelta < 2, "Past few days start should be ~7 days ago"); +} + +void tst_ChineseNLP::timeRelative_pastFewDays_synonyms() +{ + const QStringList inputs = { QStringLiteral("之前几天"), QStringLiteral("那些天") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_aWhileAgo() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("之前的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + // End = NOW - 30 days + const qint64 endDelta = qAbs(intent.timeConstraint.customEnd.secsTo(QDateTime::currentDateTime().addSecs(-2592000))); + QVERIFY2(endDelta < 2, "A while ago end should be ~30 days ago"); + // Start should be epoch + QCOMPARE(intent.timeConstraint.customStart, QDateTime::fromMSecsSinceEpoch(0)); +} + +void tst_ChineseNLP::timeRelative_aWhileAgo_synonyms() +{ + const QStringList inputs = { QStringLiteral("早些时候"), QStringLiteral("以前") }; + for 
(const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input + QStringLiteral("的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + } +} + +void tst_ChineseNLP::timeRelative_priority_vs_preset() +{ + // When both preset and relative could match, preset should win (higher priority) + // "今天之前" — "今天" matches time_today (priority 200), "之前" matches time_a_while_ago (priority 80) + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天之前的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); +} + +// ===== Dynamic Relative Time Tests ===== + +void tst_ChineseNLP::timeDynamic_recent_days() +{ + // "最近3天" — dynamic relative, should consume all 4 chars + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3天"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Days); + // Verify time range: ~3 days ago to now + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime())); + QVERIFY2(startDelta >= 259000 && startDelta <= 259300, "Start should be ~3 days ago"); + + // "近3天" — shorter variant + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近3天的图片"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 3); + QCOMPARE(intent2.timeConstraint.relativeUnit, TimeUnit::Days); + QVERIFY(intent2.fileExtensions.contains("jpg")); + + // "过去7天" — variant + ParsedIntent intent3; + m_parser->parse(QStringLiteral("过去7天"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent3.timeConstraint.relativeValue, 7); + + // "前3天" — variant + ParsedIntent intent4; + m_parser->parse(QStringLiteral("前3天的文档"), intent4); + 
QCOMPARE(intent4.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent4.timeConstraint.relativeValue, 3); +} + +void tst_ChineseNLP::timeDynamic_recent_hours() +{ + // "最近2小时" — dynamic relative hours + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近2小时的文件"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 2); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Hours); + + // "近1小时" — shorter variant + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近1小时"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 1); +} + +void tst_ChineseNLP::timeDynamic_recent_weeks() +{ + // "最近2周" — dynamic relative weeks + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近2周的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 2); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Weeks); + // ~14 days + const qint64 startDelta = qAbs(intent.timeConstraint.customStart.secsTo(QDateTime::currentDateTime())); + QVERIFY2(startDelta >= 1209000 && startDelta <= 1210000, "Start should be ~2 weeks ago"); +} + +void tst_ChineseNLP::timeDynamic_recent_months() +{ + // "最近3个月" — dynamic relative months + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3个月的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Months); + QVERIFY(intent.fileExtensions.contains("jpg")); + + // "近1月" — shorter variant without "个" + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近1月的文档"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 1); +} + +void 
tst_ChineseNLP::timeDynamic_combined_noKeyword() +{ + // The originally reported bug: "最近3天的表格" + // Should parse as: time(recent 3 days) + filetype(spreadsheet) — NO keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3天的表格"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Days); + // Filetype should be matched + QVERIFY(intent.fileExtensions.contains("xls")); + QVERIFY(intent.fileExtensions.contains("xlsx")); + QVERIFY(intent.fileExtensions.contains("csv")); + // No keywords — "3天" is consumed as part of the time expression + QVERIFY2(intent.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent.keywords.join(","))); + + // "过去7天的文档" — same pattern + ParsedIntent intent2; + m_parser->parse(QStringLiteral("过去7天的文档"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 7); + QVERIFY(!intent2.fileExtensions.isEmpty()); + QVERIFY2(intent2.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent2.keywords.join(","))); +} + +void tst_ChineseNLP::timeDynamic_combined_withType() +{ + // "最近3天的图片和视频" — time + multiple filetypes, no keyword + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近3天的图片和视频"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 3); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.fileExtensions.contains("mp4")); + QVERIFY2(intent.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent.keywords.join(","))); + + // "近2个月的压缩包" — time + filetype + ParsedIntent intent2; + m_parser->parse(QStringLiteral("近2个月的压缩包"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + 
QCOMPARE(intent2.timeConstraint.relativeValue, 2); + QCOMPARE(intent2.timeConstraint.relativeUnit, TimeUnit::Months); + QVERIFY(intent2.fileExtensions.contains("zip")); + QVERIFY2(intent2.keywords.isEmpty(), + qPrintable(QStringLiteral("Expected no keywords, got: ") + intent2.keywords.join(","))); +} + +void tst_ChineseNLP::timeDynamic_chineseNumerals() +{ + // "最近一周的图片" — 一 = 1 + ParsedIntent intent; + m_parser->parse(QStringLiteral("最近一周的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent.timeConstraint.relativeValue, 1); + QCOMPARE(intent.timeConstraint.relativeUnit, TimeUnit::Weeks); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.keywords.isEmpty()); + + // "最近两周的表格" — 两 = 2 + ParsedIntent intent2; + m_parser->parse(QStringLiteral("最近两周的表格"), intent2); + QCOMPARE(intent2.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent2.timeConstraint.relativeValue, 2); + QCOMPARE(intent2.timeConstraint.relativeUnit, TimeUnit::Weeks); + QVERIFY(intent2.fileExtensions.contains("xls")); + QVERIFY(intent2.keywords.isEmpty()); + + // "最近三天的文档" — 三 = 3 + ParsedIntent intent3; + m_parser->parse(QStringLiteral("最近三天的文档"), intent3); + QCOMPARE(intent3.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent3.timeConstraint.relativeValue, 3); + QCOMPARE(intent3.timeConstraint.relativeUnit, TimeUnit::Days); + QVERIFY(!intent3.fileExtensions.isEmpty()); + QVERIFY(intent3.keywords.isEmpty()); + + // "近五个月的视频" — 五 = 5 + ParsedIntent intent4; + m_parser->parse(QStringLiteral("近五个月的视频"), intent4); + QCOMPARE(intent4.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent4.timeConstraint.relativeValue, 5); + QCOMPARE(intent4.timeConstraint.relativeUnit, TimeUnit::Months); + QVERIFY(intent4.fileExtensions.contains("mp4")); + QVERIFY(intent4.keywords.isEmpty()); + + // "过去七天" — 七 = 7 + ParsedIntent intent5; + m_parser->parse(QStringLiteral("过去七天"), intent5); + 
QCOMPARE(intent5.timeConstraint.kind, TimeConstraintKind::Relative); + QCOMPARE(intent5.timeConstraint.relativeValue, 7); + + // Mixed: Arabic + Chinese should still work + // "最近3天" already tested above +} + +// ===== Action Behavior Tests ===== + +void tst_ChineseNLP::action_create_birthTime() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("新建的图片"), intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::BirthTime); + // Action word should be consumed + bool actionConsumed = false; + for (const MatchSpan &span : intent.consumedSpans) { + if (span.ruleId == "action_create") { + actionConsumed = true; + break; + } + } + QVERIFY2(actionConsumed, "action_create should produce a consumed span"); +} + +void tst_ChineseNLP::action_create_synonyms() +{ + const QStringList inputs = { QStringLiteral("创建的文档"), QStringLiteral("存下来的图片"), + QStringLiteral("保存的文件"), QStringLiteral("新加的视频") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::BirthTime); + } +} + +void tst_ChineseNLP::action_modify_modifyTime() +{ + ParsedIntent intent; + m_parser->parse(QStringLiteral("修改过的图片"), intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::ModifyTime); + // Action word should be consumed + bool actionConsumed = false; + for (const MatchSpan &span : intent.consumedSpans) { + if (span.ruleId == "action_modify") { + actionConsumed = true; + break; + } + } + QVERIFY2(actionConsumed, "action_modify should produce a consumed span"); +} + +void tst_ChineseNLP::action_modify_synonyms() +{ + const QStringList inputs = { QStringLiteral("编辑过的文档"), QStringLiteral("改过的文件"), + QStringLiteral("写过的图片"), QStringLiteral("更新的视频") }; + for (const QString &input : inputs) { + ParsedIntent intent; + m_parser->parse(input, intent); + QCOMPARE(intent.timeConstraint.timeField, TimeField::ModifyTime); + } +} + +void tst_ChineseNLP::action_default_unspecified() +{ + // Without 
action words, timeField should remain Unspecified + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QCOMPARE(intent.timeConstraint.timeField, TimeField::Unspecified); +} + +void tst_ChineseNLP::action_combined_withTime_create() +{ + // "新建的今天的文档" — action_create + time today + ParsedIntent intent; + m_parser->parse(QStringLiteral("新建的今天的文档"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); + QCOMPARE(intent.timeConstraint.timeField, TimeField::BirthTime); + // Both time and action spans consumed + int consumedCount = intent.consumedSpans.size(); + QVERIFY2(consumedCount >= 2, + qPrintable(QStringLiteral("Expected >=2 consumed spans, got ") + QString::number(consumedCount))); +} + +void tst_ChineseNLP::action_combined_withTime_modify() +{ + // "昨天修改过的图片" — time yesterday + action_modify + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天修改过的图片"), intent); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QCOMPARE(intent.timeConstraint.timeField, TimeField::ModifyTime); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + +// ===== Location Tests ===== + +void tst_ChineseNLP::location_desktop() +{ + // "桌面上的文档" → location(桌面) + filetype(文档) + const QString desktopPath = QStandardPaths::writableLocation(QStandardPaths::DesktopLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("桌面上的文档"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), desktopPath); + QVERIFY(!intent.fileExtensions.isEmpty()); // document type extensions + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::location_download() +{ + // "下载里的图片" → location(下载) + filetype(图片) + const QString 
downloadPath = QStandardPaths::writableLocation(QStandardPaths::DownloadLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("下载里的图片"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), downloadPath); + QVERIFY(intent.fileExtensions.contains("jpg")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::location_documentsDir() +{ + // "文档目录里的报告" → location(文档目录) + keyword(报告) + const QString docsPath = QStandardPaths::writableLocation(QStandardPaths::DocumentsLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("文档目录里的报告"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), docsPath); +} + +void tst_ChineseNLP::location_picturesDir() +{ + // "图片文件夹里的照片" → location(图片文件夹) + filetype(图片) + const QString picsPath = QStandardPaths::writableLocation(QStandardPaths::PicturesLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("图片文件夹里的照片"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), picsPath); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + +void tst_ChineseNLP::location_musicDir() +{ + // "音乐目录里的歌曲" → location(音乐目录) + const QString musicPath = QStandardPaths::writableLocation(QStandardPaths::MusicLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("音乐目录里的歌曲"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), musicPath); +} + +void tst_ChineseNLP::location_videosDir() +{ + // "视频目录下的电影" → location(视频目录) + const QString videosPath = QStandardPaths::writableLocation(QStandardPaths::MoviesLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("视频目录下的电影"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), videosPath); +} + +void tst_ChineseNLP::location_trash() +{ + // "回收站里的文件" → location(trash) + includeHidden + 
filetype(文件=文档类) + const QString trashPath = QDir::homePath() + "/.local/share/Trash/files"; + ParsedIntent intent; + m_parser->parse(QStringLiteral("回收站里的文档"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), trashPath); + QVERIFY(intent.includeHidden); + // "文件" matches filetype_document_general, so it's a filetype not a keyword + QVERIFY(!intent.fileExtensions.isEmpty()); + QVERIFY(intent.fileExtensions.contains("doc")); + QVERIFY(intent.keywords.isEmpty()); +} + +void tst_ChineseNLP::location_deleted() +{ + // "昨天删除的音乐" → location(trash) + time(yesterday) + filetype(音乐) + const QString trashPath = QDir::homePath() + "/.local/share/Trash/files"; + ParsedIntent intent; + m_parser->parse(QStringLiteral("昨天删除的音乐"), intent); + QCOMPARE(intent.searchDirectories.size(), 1); + QCOMPARE(intent.searchDirectories.first(), trashPath); + QVERIFY(intent.includeHidden); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Yesterday); + QVERIFY(intent.fileExtensions.contains("mp3")); +} + +void tst_ChineseNLP::location_noLocation() +{ + // "今天的文档" → no location, default behavior unchanged + ParsedIntent intent; + m_parser->parse(QStringLiteral("今天的文档"), intent); + QVERIFY(intent.searchDirectories.isEmpty()); + QVERIFY(!intent.includeHidden); + QCOMPARE(intent.timeConstraint.kind, TimeConstraintKind::Preset); + QCOMPARE(intent.timeConstraint.preset, TimePreset::Today); +} + +void tst_ChineseNLP::location_desktopAndDownload() +{ + // "桌面和下载的图片" → location(桌面,下载) + filetype(图片) + const QString desktopPath = QStandardPaths::writableLocation(QStandardPaths::DesktopLocation); + const QString downloadPath = QStandardPaths::writableLocation(QStandardPaths::DownloadLocation); + ParsedIntent intent; + m_parser->parse(QStringLiteral("桌面和下载的图片"), intent); + QCOMPARE(intent.searchDirectories.size(), 2); + QVERIFY(intent.searchDirectories.contains(desktopPath)); + 
QVERIFY(intent.searchDirectories.contains(downloadPath)); + QVERIFY(intent.fileExtensions.contains("jpg")); +} + +QObject *create_tst_ChineseNLP() +{ + return new tst_ChineseNLP(); +} + +#include "tst_chinese_nlp.moc" diff --git a/autotests/dfm-search-tests/tst_content_retriever.cpp b/autotests/dfm-search-tests/tst_content_retriever.cpp new file mode 100644 index 00000000..22a50199 --- /dev/null +++ b/autotests/dfm-search-tests/tst_content_retriever.cpp @@ -0,0 +1,215 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace dfmsearch; +using namespace Lucene; + +namespace { + +void addStoredDocument(const IndexWriterPtr &writer, + SearchType type, + const QString &path, + const QString &filename, + const QString &content) +{ + DocumentPtr doc = newLucene(); + if (type == SearchType::Ocr) { + doc->add(newLucene(LuceneFieldNames::OcrText::kPath, path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kFilename, filename.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kOcrContents, content.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + } else { + doc->add(newLucene(LuceneFieldNames::Content::kPath, path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kFilename, filename.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kContents, content.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + } + writer->addDocument(doc); +} + +void createIndex(const QString &indexDir, SearchType type) +{ + QDir().mkpath(indexDir); + IndexWriterPtr writer = newLucene( + 
FSDirectory::open(indexDir.toStdWString()), + newLucene(), + true, + IndexWriter::MaxFieldLengthLIMITED); + + if (type == SearchType::Content) { + addStoredDocument(writer, type, + "/tmp/doc-a.txt", + "doc-a.txt", + "hello world from content index"); + addStoredDocument(writer, type, + "/tmp/doc-b.txt", + "doc-b.txt", + "meeting notes and budget data"); + } else { + addStoredDocument(writer, type, + "/tmp/img-a.png", + "img-a.png", + "screenshot text from OCR"); + } + + writer->close(); +} + +} // namespace + +class tst_ContentRetriever : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void fetchContent_single(); + void fetchContent_batch(); + void fetchHighlight_usesTemporaryIndex(); + void concurrentFetch_sharedRetriever(); +}; + +void tst_ContentRetriever::fetchContent_single() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + const QString ocrIndexDir = tempDir.path() + "/ocr-index"; + createIndex(contentIndexDir, SearchType::Content); + createIndex(ocrIndexDir, SearchType::Ocr); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + retriever.setIndexDirectory(SearchType::Ocr, ocrIndexDir); + + QCOMPARE(retriever.fetchContent("/tmp/doc-a.txt", SearchType::Content), + QString("hello world from content index")); + QCOMPARE(retriever.fetchContent("/tmp/img-a.png", SearchType::Ocr), + QString("screenshot text from OCR")); + QVERIFY(retriever.fetchContent("/tmp/missing.txt", SearchType::Content).isEmpty()); +} + +void tst_ContentRetriever::fetchContent_batch() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + createIndex(contentIndexDir, SearchType::Content); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + const QMap results = retriever.fetchContents( + { "/tmp/doc-a.txt", "/tmp/doc-b.txt", 
"/tmp/missing.txt" }, + SearchType::Content); + + QCOMPARE(results.value("/tmp/doc-a.txt"), QString("hello world from content index")); + QCOMPARE(results.value("/tmp/doc-b.txt"), QString("meeting notes and budget data")); + QVERIFY(results.contains("/tmp/missing.txt")); + QVERIFY(results.value("/tmp/missing.txt").isEmpty()); +} + +void tst_ContentRetriever::fetchHighlight_usesTemporaryIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + createIndex(contentIndexDir, SearchType::Content); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + HighlightOptions options; + options.maxPreviewLength = 80; + + const QString snippet = retriever.fetchHighlight("/tmp/doc-b.txt", + "budget", + SearchType::Content, + options); + QVERIFY(snippet.contains("budget", Qt::CaseInsensitive)); +} + +void tst_ContentRetriever::concurrentFetch_sharedRetriever() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString contentIndexDir = tempDir.path() + "/content-index"; + const QString ocrIndexDir = tempDir.path() + "/ocr-index"; + createIndex(contentIndexDir, SearchType::Content); + createIndex(ocrIndexDir, SearchType::Ocr); + + ContentRetriever retriever; + retriever.setIndexDirectory(SearchType::Content, contentIndexDir); + retriever.setIndexDirectory(SearchType::Ocr, ocrIndexDir); + + HighlightOptions options; + options.maxPreviewLength = 80; + + std::atomic_bool failed { false }; + std::vector> tasks; + tasks.reserve(8); + + for (int worker = 0; worker < 8; ++worker) { + tasks.emplace_back(std::async(std::launch::async, [&retriever, &options, &failed]() { + for (int i = 0; i < 50; ++i) { + if (retriever.fetchContent("/tmp/doc-a.txt", SearchType::Content) + != QString("hello world from content index")) { + failed.store(true); + return; + } + + if (retriever.fetchContent("/tmp/img-a.png", SearchType::Ocr) + != QString("screenshot text 
from OCR")) { + failed.store(true); + return; + } + + const QString snippet = retriever.fetchHighlight("/tmp/doc-b.txt", + "budget", + SearchType::Content, + options); + if (!snippet.contains("budget", Qt::CaseInsensitive)) { + failed.store(true); + return; + } + } + })); + } + + for (auto &task : tasks) { + task.get(); + } + + QVERIFY(!failed.load()); +} + +QObject *create_tst_ContentRetriever() +{ + return new tst_ContentRetriever(); +} + +#include "tst_content_retriever.moc" diff --git a/autotests/dfm-search-tests/tst_content_search_engine.cpp b/autotests/dfm-search-tests/tst_content_search_engine.cpp new file mode 100644 index 00000000..7eb04fb7 --- /dev/null +++ b/autotests/dfm-search-tests/tst_content_search_engine.cpp @@ -0,0 +1,495 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace DFMSEARCH; +using namespace Lucene; + +namespace { + +struct TestDocument +{ + QString path; + QString filename; + QString content; + QString ancestorPath; + QString hidden = "N"; + qint64 modifyTime = 1710000000; + qint64 birthTime = 1700000000; + qint64 fileSize = 1024; +}; + +DocumentPtr buildDocument(const TestDocument &docData) +{ + DocumentPtr doc = newLucene(); + doc->add(newLucene(LuceneFieldNames::Content::kPath, docData.path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kFilename, docData.filename.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kContents, docData.content.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kAncestorPaths, docData.ancestorPath.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + 
doc->add(newLucene(LuceneFieldNames::Content::kIsHidden, docData.hidden.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kModifyTime, QString::number(docData.modifyTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kBirthTime, QString::number(docData.birthTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::Content::kFileSize, QString::number(docData.fileSize).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + return doc; +} + +DocumentPtr buildOcrDocument(const TestDocument &docData) +{ + DocumentPtr doc = newLucene(); + doc->add(newLucene(LuceneFieldNames::OcrText::kPath, docData.path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kFilename, docData.filename.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kOcrContents, docData.content.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kAncestorPaths, docData.ancestorPath.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kIsHidden, docData.hidden.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kModifyTime, QString::number(docData.modifyTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kBirthTime, QString::number(docData.birthTime).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kFileSize, QString::number(docData.fileSize).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::OcrText::kCheckSum, + 
QStringLiteral("checksum-%1").arg(docData.filename).toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + return doc; +} + +void createContentIndex(const QString &indexDir, const QList &documents) +{ + QDir().mkpath(indexDir); + + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + newLucene(1, 2), + true, + IndexWriter::MaxFieldLengthLIMITED); + + for (const TestDocument &doc : documents) { + writer->addDocument(buildDocument(doc)); + } + + writer->close(); +} + +void createOcrIndex(const QString &indexDir, const QList &documents) +{ + QDir().mkpath(indexDir); + + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + newLucene(1, 2), + true, + IndexWriter::MaxFieldLengthLIMITED); + + for (const TestDocument &doc : documents) { + writer->addDocument(buildOcrDocument(doc)); + } + + writer->close(); +} + +SearchOptions createBaseOptions(const QString &searchPath, const QString &indexDir) +{ + (void) indexDir; + SearchOptions options; + options.setSearchMethod(SearchMethod::Indexed); + options.setSearchPath(searchPath); + options.setSyncSearchTimeout(5); + return options; +} + +QStringList resultPaths(const SearchResultExpected &expected) +{ + QStringList paths; + const SearchResultList results = expected.value(); + for (const SearchResult &result : results) { + paths.append(result.path()); + } + return paths; +} + +} // namespace + +class tst_ContentSearchEngine : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void search_simpleContent_usesTemporaryIndex(); + void search_booleanAnd_matchesContentOnly(); + void search_booleanOr_matchesAnyContent(); + void search_filenameAndContentMixed_requiresBoth(); + void search_mixedAnd_excludesPureFilenameOnlyMatches(); + void search_simpleOcr_usesTemporaryIndex(); + void search_ocrBooleanAnd_matchesOcrContentOnly(); + void search_ocrBooleanOr_matchesAnyOcrContent(); + void search_ocrFilenameAndContentMixed_requiresBoth(); + void 
search_ocrMixedAnd_excludesPureFilenameOnlyMatches(); +}; + +void tst_ContentSearchEngine::search_simpleContent_usesTemporaryIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/alpha-report.txt", "alpha-report.txt", "alpha budget summary", rootDir }, + { rootDir + "/meeting-notes.txt", "meeting-notes.txt", "meeting notes and timeline", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-report.txt" }); +} + +void tst_ContentSearchEngine::search_booleanAnd_matchesContentOnly() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/roadmap.txt", "roadmap.txt", "alpha budget roadmap", rootDir }, + { rootDir + "/budget-only.txt", "budget-only.txt", "budget only", rootDir }, + { rootDir + "/alpha-only.txt", "alpha-only.txt", "alpha only", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, 
SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/roadmap.txt" }); +} + +void tst_ContentSearchEngine::search_booleanOr_matchesAnyContent() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/alpha.txt", "alpha.txt", "alpha planning", rootDir }, + { rootDir + "/budget.txt", "budget.txt", "budget tracking", rootDir }, + { rootDir + "/other.txt", "other.txt", "travel notes", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/alpha.txt")); + QVERIFY(paths.contains(rootDir + "/budget.txt")); +} + +void tst_ContentSearchEngine::search_filenameAndContentMixed_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/budget-alpha.txt", "budget-alpha.txt", "alpha roadmap", rootDir }, + { rootDir + "/budget-gamma.txt", "budget-gamma.txt", "gamma roadmap", rootDir }, + { rootDir + "/alpha-only.txt", "alpha-only.txt", "alpha roadmap", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { 
+ return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + ContentOptionsAPI contentOptions(options); + contentOptions.setFilenameKeyword("budget"); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("alpha")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget-alpha.txt" }); +} + +void tst_ContentSearchEngine::search_mixedAnd_excludesPureFilenameOnlyMatches() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/content-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createContentIndex(indexDir, { + { rootDir + "/alpha-budget.txt", "alpha-budget.txt", "general notes", rootDir }, + { rootDir + "/alpha-plan.txt", "alpha-plan.txt", "budget implementation details", rootDir }, + { rootDir + "/budget-plan.txt", "budget-plan.txt", "alpha implementation details", rootDir }, + { rootDir + "/alpha-budget-content.txt", "alpha-budget-content.txt", "alpha budget implementation", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::contentIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + ContentOptionsAPI contentOptions(options); + contentOptions.setFilenameContentMixedAndSearchEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::Content)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 3); + QVERIFY(!paths.contains(rootDir + "/alpha-budget.txt")); + 
QVERIFY(paths.contains(rootDir + "/alpha-plan.txt")); + QVERIFY(paths.contains(rootDir + "/budget-plan.txt")); + QVERIFY(paths.contains(rootDir + "/alpha-budget-content.txt")); +} + +void tst_ContentSearchEngine::search_simpleOcr_usesTemporaryIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/scan-a.png", "scan-a.png", "invoice amount recognized", rootDir }, + { rootDir + "/scan-b.png", "scan-b.png", "meeting room whiteboard", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("invoice")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/scan-a.png" }); +} + +void tst_ContentSearchEngine::search_ocrBooleanAnd_matchesOcrContentOnly() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/receipt.png", "receipt.png", "invoice amount total", rootDir }, + { rootDir + "/amount-only.png", "amount-only.png", "amount only", rootDir }, + { rootDir + "/invoice-only.png", "invoice-only.png", "invoice only", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected 
expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "invoice", "amount" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/receipt.png" }); +} + +void tst_ContentSearchEngine::search_ocrBooleanOr_matchesAnyOcrContent() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/invoice.png", "invoice.png", "invoice recognized", rootDir }, + { rootDir + "/budget.png", "budget.png", "budget recognized", rootDir }, + { rootDir + "/other.png", "other.png", "travel receipt", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(createBaseOptions(rootDir, indexDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "invoice", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/invoice.png")); + QVERIFY(paths.contains(rootDir + "/budget.png")); +} + +void tst_ContentSearchEngine::search_ocrFilenameAndContentMixed_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/budget-invoice.png", "budget-invoice.png", "invoice details", rootDir }, + { rootDir + "/budget-other.png", "budget-other.png", "other details", rootDir }, + { rootDir + "/invoice-only.png", "invoice-only.png", "invoice details", 
rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + OcrTextOptionsAPI ocrOptions(options); + ocrOptions.setFilenameKeyword("budget"); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("invoice")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget-invoice.png" }); +} + +void tst_ContentSearchEngine::search_ocrMixedAnd_excludesPureFilenameOnlyMatches() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/ocr-docs"; + const QString indexDir = tempDir.path() + "/ocr-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createOcrIndex(indexDir, { + { rootDir + "/invoice-budget.png", "invoice-budget.png", "generic text", rootDir }, + { rootDir + "/invoice-note.png", "invoice-note.png", "budget recognized", rootDir }, + { rootDir + "/budget-note.png", "budget-note.png", "invoice recognized", rootDir }, + { rootDir + "/invoice-budget-content.png", "invoice-budget-content.png", "invoice budget recognized", rootDir }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::ocrTextIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir, indexDir); + OcrTextOptionsAPI ocrOptions(options); + ocrOptions.setFilenameOcrContentMixedAndSearchEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::Ocr)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "invoice", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + 
QCOMPARE(paths.size(), 3); + QVERIFY(!paths.contains(rootDir + "/invoice-budget.png")); + QVERIFY(paths.contains(rootDir + "/invoice-note.png")); + QVERIFY(paths.contains(rootDir + "/budget-note.png")); + QVERIFY(paths.contains(rootDir + "/invoice-budget-content.png")); +} + +QObject *create_tst_ContentSearchEngine() +{ + return new tst_ContentSearchEngine(); +} + +#include "tst_content_search_engine.moc" diff --git a/autotests/dfm-search-tests/tst_filename_search_engine.cpp b/autotests/dfm-search-tests/tst_filename_search_engine.cpp new file mode 100644 index 00000000..c599b6c3 --- /dev/null +++ b/autotests/dfm-search-tests/tst_filename_search_engine.cpp @@ -0,0 +1,919 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace DFMSEARCH; +using namespace Lucene; + +namespace { + +struct TestDocument +{ + QString path; + QString filename; + QString fileType; + QString fileExt; + QString pinyin; + QString pinyinAcronym; + QString hidden = "N"; + qint64 modifyTime = 1710000000; + qint64 birthTime = 1700000000; + qint64 fileSize = 1024; + QString fileSizeStr = "1 KB"; +}; + +QStringList ancestorPathsForDocument(const QString &path) +{ + QStringList ancestors; + QFileInfo info(path); + QDir dir = info.dir(); + + while (dir.exists()) { + const QString current = QDir::cleanPath(dir.absolutePath()); + ancestors.append(current); + if (!dir.cdUp()) { + break; + } + } + + ancestors.removeDuplicates(); + return ancestors; +} + +DocumentPtr buildDocument(const TestDocument &docData) +{ + DocumentPtr doc = newLucene(); + + doc->add(newLucene(LuceneFieldNames::FileName::kFullPath, docData.path.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + 
doc->add(newLucene(LuceneFieldNames::FileName::kFileName, docData.filename.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileNameLower, docData.filename.toLower().toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileType, docData.fileType.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kFileExt, docData.fileExt.toLower().toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kPinyin, docData.pinyin.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kPinyinAcronym, docData.pinyinAcronym.toStdWString(), + Field::STORE_YES, Field::INDEX_ANALYZED)); + doc->add(newLucene(LuceneFieldNames::FileName::kIsHidden, docData.hidden.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + NumericFieldPtr modifyTimeField = newLucene(LuceneFieldNames::FileName::kModifyTime, + Field::STORE_YES, true); + modifyTimeField->setLongValue(docData.modifyTime); + doc->add(modifyTimeField); + + NumericFieldPtr birthTimeField = newLucene(LuceneFieldNames::FileName::kBirthTime, + Field::STORE_YES, true); + birthTimeField->setLongValue(docData.birthTime); + doc->add(birthTimeField); + + NumericFieldPtr fileSizeField = newLucene(LuceneFieldNames::FileName::kFileSize, + Field::STORE_YES, true); + fileSizeField->setLongValue(docData.fileSize); + doc->add(fileSizeField); + doc->add(newLucene(LuceneFieldNames::FileName::kFileSizeStr, docData.fileSizeStr.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + + for (const QString &ancestor : ancestorPathsForDocument(docData.path)) { + doc->add(newLucene(LuceneFieldNames::FileName::kAncestorPaths, ancestor.toStdWString(), + Field::STORE_YES, Field::INDEX_NOT_ANALYZED)); + } + + return doc; +} + +void 
createFileNameIndex(const QString &indexDir, const QList &documents) +{ + QDir().mkpath(indexDir); + + IndexWriterPtr writer = newLucene( + FSDirectory::open(indexDir.toStdWString()), + newLucene(1, 2), + true, + IndexWriter::MaxFieldLengthLIMITED); + + for (const TestDocument &doc : documents) { + writer->addDocument(buildDocument(doc)); + } + + writer->close(); +} + +SearchOptions createBaseOptions(const QString &searchPath) +{ + SearchOptions options; + options.setSearchMethod(SearchMethod::Indexed); + options.setSearchPath(searchPath); + options.setSyncSearchTimeout(5); + return options; +} + +SearchOptions createRealtimeOptions(const QString &searchPath) +{ + SearchOptions options = createBaseOptions(searchPath); + options.setSearchMethod(SearchMethod::Realtime); + return options; +} + +bool createFileWithSize(const QString &path, qint64 size) +{ + QFile file(path); + if (!file.open(QIODevice::WriteOnly)) { + return false; + } + + if (size > 0) { + file.write(QByteArray(static_cast(size), 'a')); + } + + file.close(); + return true; +} + +QStringList resultPaths(const SearchResultExpected &expected) +{ + QStringList paths; + const SearchResultList results = expected.value(); + for (const SearchResult &result : results) { + paths.append(result.path()); + } + return paths; +} + +SearchQuery createWildcardQuery(const QString &pattern) +{ + SearchQuery query(pattern, SearchQuery::Type::Wildcard); + return query; +} + +} // namespace + +class tst_FileNameSearchEngine : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void search_simpleKeyword_matchesIndexedFilename(); + void search_booleanAnd_requiresAllTerms(); + void search_booleanOr_matchesAnyTerm(); + void search_wildcard_matchesByPattern(); + void search_fileTypeFilterOnly_returnsAllMatchingTypes(); + void search_extensionFilterOnly_returnsAllMatchingSuffixes(); + void search_keywordAndTypeFilter_requiresBoth(); + void search_keywordAndExtensionFilter_requiresBoth(); + void 
search_hiddenFiles_excludedByDefault_andIncludedWhenEnabled(); + void search_excludedPath_filtersSubtreeAtQueryLayer(); + void search_sizeAndTimeFilters_applyOnIndexedFields(); + void search_pinyinAndAcronym_queriesMatchIndexedFields(); + void search_detailedResults_populatesExtendedAttributes(); + void search_emptyKeywordWithoutFilters_returnsValidationError(); + void search_invalidFileType_returnsValidationError(); + void realtime_simpleKeyword_matchesFilesystemEntries(); + void realtime_booleanAndOr_andWildcard_queriesWork(); + void realtime_extensionFilters_areApplied(); + void realtime_hiddenAndExcludedPath_filtersWork(); + void realtime_sizeAndTimeFilters_applyWithoutIndex(); + void realtime_detailedResults_populateAttributes(); + void realtime_pinyinOption_doesNotProducePinyinMatches(); +}; + +void tst_FileNameSearchEngine::search_simpleKeyword_matchesIndexedFilename() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/alpha-report.txt", "alpha-report.txt", "doc", "txt" }, + { rootDir + "/meeting-notes.txt", "meeting-notes.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("report")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-report.txt" }); +} + +void tst_FileNameSearchEngine::search_booleanAnd_requiresAllTerms() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + 
"/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/alpha-budget.txt", "alpha-budget.txt", "doc", "txt" }, + { rootDir + "/alpha-only.txt", "alpha-only.txt", "doc", "txt" }, + { rootDir + "/budget-only.txt", "budget-only.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::AND)); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-budget.txt" }); +} + +void tst_FileNameSearchEngine::search_booleanOr_matchesAnyTerm() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/alpha.txt", "alpha.txt", "doc", "txt" }, + { rootDir + "/budget.xlsx", "budget.xlsx", "doc", "xlsx" }, + { rootDir + "/travel.jpg", "travel.jpg", "pic", "jpg" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/alpha.txt")); + QVERIFY(paths.contains(rootDir + "/budget.xlsx")); +} + +void 
tst_FileNameSearchEngine::search_wildcard_matchesByPattern() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/Budget-2026.txt", "Budget-2026.txt", "doc", "txt" }, + { rootDir + "/Budget-2025.txt", "Budget-2025.txt", "doc", "txt" }, + { rootDir + "/Notes-2026.txt", "Notes-2026.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + options.setCaseSensitive(false); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(createWildcardQuery("budget-202?.txt")); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/Budget-2026.txt")); + QVERIFY(paths.contains(rootDir + "/Budget-2025.txt")); +} + +void tst_FileNameSearchEngine::search_fileTypeFilterOnly_returnsAllMatchingTypes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/report.txt", "report.txt", "doc", "txt" }, + { rootDir + "/slides.pptx", "slides.pptx", "doc", "pptx" }, + { rootDir + "/holiday.jpg", "holiday.jpg", "pic", "jpg" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileTypes({ "doc" }); + + std::unique_ptr 
engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/report.txt")); + QVERIFY(paths.contains(rootDir + "/slides.pptx")); +} + +void tst_FileNameSearchEngine::search_extensionFilterOnly_returnsAllMatchingSuffixes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/one.txt", "one.txt", "doc", "txt" }, + { rootDir + "/two.TXT", "two.TXT", "doc", "txt" }, + { rootDir + "/three.md", "three.md", "doc", "md" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileExtensions({ "txt" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(expected.hasValue()); + + const QStringList paths = resultPaths(expected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(rootDir + "/one.txt")); + QVERIFY(paths.contains(rootDir + "/two.TXT")); +} + +void tst_FileNameSearchEngine::search_keywordAndTypeFilter_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/budget.txt", "budget.txt", "doc", "txt" }, + { rootDir + 
"/budget.jpg", "budget.jpg", "pic", "jpg" }, + { rootDir + "/notes.txt", "notes.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileTypes({ "doc" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget.txt" }); +} + +void tst_FileNameSearchEngine::search_keywordAndExtensionFilter_requiresBoth() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/budget.txt", "budget.txt", "doc", "txt" }, + { rootDir + "/budget.md", "budget.md", "doc", "md" }, + { rootDir + "/summary.txt", "summary.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileExtensions({ "txt" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/budget.txt" }); +} + +void tst_FileNameSearchEngine::search_hiddenFiles_excludedByDefault_andIncludedWhenEnabled() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const 
QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/visible-plan.txt", "visible-plan.txt", "doc", "txt", "", "", "N" }, + { rootDir + "/.hidden-plan.txt", ".hidden-plan.txt", "doc", "txt", "", "", "Y" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions defaultOptions = createBaseOptions(rootDir); + engine->setSearchOptions(defaultOptions); + const SearchResultExpected defaultExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(defaultExpected.hasValue()); + QCOMPARE(resultPaths(defaultExpected), QStringList { rootDir + "/visible-plan.txt" }); + + SearchOptions includeHiddenOptions = createBaseOptions(rootDir); + includeHiddenOptions.setIncludeHidden(true); + engine->setSearchOptions(includeHiddenOptions); + const SearchResultExpected includeHiddenExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(includeHiddenExpected.hasValue()); + QCOMPARE(resultPaths(includeHiddenExpected).size(), 2); +} + +void tst_FileNameSearchEngine::search_excludedPath_filtersSubtreeAtQueryLayer() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString includedDir = rootDir + "/included"; + const QString excludedDir = rootDir + "/excluded"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(includedDir)); + QVERIFY(QDir().mkpath(excludedDir)); + + createFileNameIndex(indexDir, { + { includedDir + "/budget.txt", "budget.txt", "doc", "txt" }, + { excludedDir + "/budget.txt", "budget.txt", "doc", "txt" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = 
createBaseOptions(rootDir); + options.setSearchExcludedPaths({ excludedDir }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("budget")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { includedDir + "/budget.txt" }); +} + +void tst_FileNameSearchEngine::search_sizeAndTimeFilters_applyOnIndexedFields() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/recent-large.txt", "recent-large.txt", "doc", "txt", "", "", "N", 1712000000, 1700000000, 4096, "4 KB" }, + { rootDir + "/recent-small.txt", "recent-small.txt", "doc", "txt", "", "", "N", 1712000000, 1700000000, 128, "128 B" }, + { rootDir + "/old-large.txt", "old-large.txt", "doc", "txt", "", "", "N", 1701000000, 1690000000, 4096, "4 KB" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + SizeRangeFilter sizeFilter; + sizeFilter.setMin(1024); + options.setSizeRangeFilter(sizeFilter); + + TimeRangeFilter timeFilter; + timeFilter.setTimeField(TimeField::ModifyTime) + .setRange(QDateTime::fromSecsSinceEpoch(1711500000), + QDateTime::fromSecsSinceEpoch(1712500000)); + options.setTimeRangeFilter(timeFilter); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("recent")); + QVERIFY(expected.hasValue()); + + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/recent-large.txt" }); +} + +void 
tst_FileNameSearchEngine::search_pinyinAndAcronym_queriesMatchIndexedFields() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + createFileNameIndex(indexDir, { + { rootDir + "/项目计划.docx", "项目计划.docx", "doc", "docx", "xiangmujihua", "xmjh" }, + { rootDir + "/项目总结.docx", "项目总结.docx", "doc", "docx", "xiangmuzongjie", "xmzj" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions pinyinOptions = createBaseOptions(rootDir); + FileNameOptionsAPI pinyinApi(pinyinOptions); + pinyinApi.setPinyinEnabled(true); + engine->setSearchOptions(pinyinOptions); + + const SearchResultExpected pinyinExpected = engine->searchSync(SearchQuery::createSimpleQuery("xiangmujihua")); + QVERIFY(pinyinExpected.hasValue()); + QCOMPARE(resultPaths(pinyinExpected), QStringList { rootDir + "/项目计划.docx" }); + + SearchOptions acronymOptions = createBaseOptions(rootDir); + FileNameOptionsAPI acronymApi(acronymOptions); + acronymApi.setPinyinAcronymEnabled(true); + engine->setSearchOptions(acronymOptions); + + const SearchResultExpected acronymExpected = engine->searchSync(SearchQuery::createSimpleQuery("xmjh")); + QVERIFY(acronymExpected.hasValue()); + QCOMPARE(resultPaths(acronymExpected), QStringList { rootDir + "/项目计划.docx" }); +} + +void tst_FileNameSearchEngine::search_detailedResults_populatesExtendedAttributes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString indexDir = tempDir.path() + "/filename-index"; + QVERIFY(QDir().mkpath(rootDir)); + + const qint64 modifyTime = 1712345678; + const qint64 birthTime = 1701234567; + createFileNameIndex(indexDir, { + { rootDir + "/archive.zip", 
"archive.zip", "archive", "zip", "", "", "N", modifyTime, birthTime, 2048, "2 KB" }, + }); + + stub_ext::StubExt stub; + stub.set_lamda(DFMSEARCH::Global::fileNameIndexDirectory, [&indexDir]() { + return indexDir; + }); + + SearchOptions options = createBaseOptions(rootDir); + options.setDetailedResultsEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("archive")); + QVERIFY(expected.hasValue()); + QCOMPARE(expected.value().size(), 1); + + SearchResult result = expected.value().first(); + FileNameResultAPI api(result); + QCOMPARE(result.path(), rootDir + "/archive.zip"); + QCOMPARE(api.filename(), QString("archive.zip")); + QCOMPARE(api.fileExtension(), QString("zip")); + QCOMPARE(api.fileType(), QString("archive")); + QCOMPARE(api.fileSizeBytes(), qint64(2048)); + QCOMPARE(api.size(), QString("2 KB")); + QCOMPARE(api.modifyTimestamp(), modifyTime); + QCOMPARE(api.birthTimestamp(), birthTime); + QCOMPARE(api.isDirectory(), false); + QCOMPARE(api.isHidden(), false); +} + +void tst_FileNameSearchEngine::search_emptyKeywordWithoutFilters_returnsValidationError() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createBaseOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(!expected.hasValue()); + QCOMPARE(expected.error().code().value(), static_cast(FileNameSearchErrorCode::KeywordIsEmpty)); +} + +void tst_FileNameSearchEngine::search_invalidFileType_returnsValidationError() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + + 
SearchOptions options = createBaseOptions(rootDir); + FileNameOptionsAPI api(options); + api.setFileTypes({ "invalid-type" }); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("report")); + QVERIFY(!expected.hasValue()); + QCOMPARE(expected.error().code().value(), static_cast(FileNameSearchErrorCode::InvalidFileTypes)); +} + +void tst_FileNameSearchEngine::realtime_simpleKeyword_matchesFilesystemEntries() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/alpha-report.txt", 32)); + QVERIFY(createFileWithSize(rootDir + "/meeting-notes.txt", 32)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(createRealtimeOptions(rootDir)); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("report")); + QVERIFY(expected.hasValue()); + QCOMPARE(resultPaths(expected), QStringList { rootDir + "/alpha-report.txt" }); +} + +void tst_FileNameSearchEngine::realtime_booleanAndOr_andWildcard_queriesWork() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/alpha-budget.txt", 16)); + QVERIFY(createFileWithSize(rootDir + "/alpha-only.txt", 16)); + QVERIFY(createFileWithSize(rootDir + "/budget-only.txt", 16)); + QVERIFY(createFileWithSize(rootDir + "/Budget-2026.txt", 16)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions boolOptions = createRealtimeOptions(rootDir); + engine->setSearchOptions(boolOptions); + + const SearchResultExpected andExpected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, 
SearchQuery::BooleanOperator::AND)); + QVERIFY(andExpected.hasValue()); + QCOMPARE(resultPaths(andExpected), QStringList { rootDir + "/alpha-budget.txt" }); + + const SearchResultExpected orExpected = engine->searchSync( + SearchQuery::createBooleanQuery({ "alpha", "budget" }, SearchQuery::BooleanOperator::OR)); + QVERIFY(orExpected.hasValue()); + QCOMPARE(orExpected.value().size(), 4); + + SearchOptions wildcardOptions = createRealtimeOptions(rootDir); + wildcardOptions.setCaseSensitive(false); + engine->setSearchOptions(wildcardOptions); + + const SearchResultExpected wildcardExpected = engine->searchSync(createWildcardQuery("budget-202?.txt")); + QVERIFY(wildcardExpected.hasValue()); + QCOMPARE(resultPaths(wildcardExpected), QStringList { rootDir + "/Budget-2026.txt" }); +} + +void tst_FileNameSearchEngine::realtime_extensionFilters_areApplied() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/report.txt", 10)); + QVERIFY(createFileWithSize(rootDir + "/slides.pptx", 10)); + QVERIFY(createFileWithSize(rootDir + "/holiday.jpg", 10)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions extOptions = createRealtimeOptions(rootDir); + FileNameOptionsAPI extApi(extOptions); + extApi.setFileExtensions({ "txt", "pptx" }); + engine->setSearchOptions(extOptions); + + const SearchResultExpected extExpected = engine->searchSync(SearchQuery::createSimpleQuery(QString())); + QVERIFY(extExpected.hasValue()); + QCOMPARE(extExpected.value().size(), 2); +} + +void tst_FileNameSearchEngine::realtime_hiddenAndExcludedPath_filtersWork() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + const QString includedDir = rootDir + "/included"; + const QString excludedDir = rootDir + "/excluded"; + QVERIFY(QDir().mkpath(includedDir)); + 
QVERIFY(QDir().mkpath(excludedDir)); + QVERIFY(createFileWithSize(includedDir + "/visible-plan.txt", 8)); + QVERIFY(createFileWithSize(rootDir + "/.hidden-plan.txt", 8)); + QVERIFY(createFileWithSize(excludedDir + "/visible-plan.txt", 8)); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + + SearchOptions defaultOptions = createRealtimeOptions(rootDir); + defaultOptions.setSearchExcludedPaths({ excludedDir }); + engine->setSearchOptions(defaultOptions); + + const SearchResultExpected defaultExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(defaultExpected.hasValue()); + QCOMPARE(resultPaths(defaultExpected), QStringList { includedDir + "/visible-plan.txt" }); + + SearchOptions includeHiddenOptions = createRealtimeOptions(rootDir); + includeHiddenOptions.setIncludeHidden(true); + includeHiddenOptions.setSearchExcludedPaths({ excludedDir }); + engine->setSearchOptions(includeHiddenOptions); + + const SearchResultExpected includeHiddenExpected = engine->searchSync(SearchQuery::createSimpleQuery("plan")); + QVERIFY(includeHiddenExpected.hasValue()); + const QStringList paths = resultPaths(includeHiddenExpected); + QCOMPARE(paths.size(), 2); + QVERIFY(paths.contains(includedDir + "/visible-plan.txt")); + QVERIFY(paths.contains(rootDir + "/.hidden-plan.txt")); +} + +void tst_FileNameSearchEngine::realtime_sizeAndTimeFilters_applyWithoutIndex() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + + const QString recentLarge = rootDir + "/recent-large.txt"; + const QString recentSmall = rootDir + "/recent-small.txt"; + const QString oldLarge = rootDir + "/old-large.txt"; + QVERIFY(createFileWithSize(recentLarge, 4096)); + QVERIFY(createFileWithSize(recentSmall, 128)); + QVERIFY(createFileWithSize(oldLarge, 4096)); + + const QDateTime recentTime = QDateTime::fromSecsSinceEpoch(1712000000); + const QDateTime oldTime = 
QDateTime::fromSecsSinceEpoch(1701000000); + QFile recentLargeFile(recentLarge); + QFile recentSmallFile(recentSmall); + QFile oldLargeFile(oldLarge); + QVERIFY(recentLargeFile.open(QIODevice::ReadWrite)); + QVERIFY(recentSmallFile.open(QIODevice::ReadWrite)); + QVERIFY(oldLargeFile.open(QIODevice::ReadWrite)); + QVERIFY(recentLargeFile.setFileTime(recentTime, QFileDevice::FileModificationTime)); + QVERIFY(recentSmallFile.setFileTime(recentTime, QFileDevice::FileModificationTime)); + QVERIFY(oldLargeFile.setFileTime(oldTime, QFileDevice::FileModificationTime)); + recentLargeFile.close(); + recentSmallFile.close(); + oldLargeFile.close(); + + SearchOptions options = createRealtimeOptions(rootDir); + SizeRangeFilter sizeFilter; + sizeFilter.setMin(1024); + options.setSizeRangeFilter(sizeFilter); + + TimeRangeFilter timeFilter; + timeFilter.setTimeField(TimeField::ModifyTime) + .setRange(QDateTime::fromSecsSinceEpoch(1711500000), + QDateTime::fromSecsSinceEpoch(1712500000)); + options.setTimeRangeFilter(timeFilter); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("recent")); + QVERIFY(expected.hasValue()); + QCOMPARE(resultPaths(expected), QStringList { recentLarge }); +} + +void tst_FileNameSearchEngine::realtime_detailedResults_populateAttributes() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + const QString archivePath = rootDir + "/archive.zip"; + QVERIFY(createFileWithSize(archivePath, 2048)); + + SearchOptions options = createRealtimeOptions(rootDir); + options.setDetailedResultsEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("archive")); + 
QVERIFY(expected.hasValue()); + QCOMPARE(expected.value().size(), 1); + + SearchResult result = expected.value().first(); + FileNameResultAPI api(result); + QCOMPARE(result.path(), archivePath); + QCOMPARE(api.filename(), QString("archive.zip")); + QCOMPARE(api.fileExtension(), QString("zip")); + QCOMPARE(api.fileType(), QString("zip")); + QCOMPARE(api.fileSizeBytes(), qint64(2048)); + QCOMPARE(api.isDirectory(), false); + QCOMPARE(api.isHidden(), false); + QVERIFY(api.modifyTimestamp() > 0); +} + +void tst_FileNameSearchEngine::realtime_pinyinOption_doesNotProducePinyinMatches() +{ + QTemporaryDir tempDir; + QVERIFY(tempDir.isValid()); + + const QString rootDir = tempDir.path() + "/docs"; + QVERIFY(QDir().mkpath(rootDir)); + QVERIFY(createFileWithSize(rootDir + "/项目计划.docx", 32)); + + SearchOptions options = createRealtimeOptions(rootDir); + FileNameOptionsAPI api(options); + api.setPinyinEnabled(true); + + std::unique_ptr engine(SearchEngine::create(SearchType::FileName)); + engine->setSearchOptions(options); + + const SearchResultExpected expected = engine->searchSync(SearchQuery::createSimpleQuery("xiangmujihua")); + QVERIFY(expected.hasValue()); + QCOMPARE(expected.value().size(), 0); +} + +QObject *create_tst_FileNameSearchEngine() +{ + return new tst_FileNameSearchEngine(); +} + +#include "tst_filename_search_engine.moc" diff --git a/autotests/dfm-search-tests/tst_search_utils.cpp b/autotests/dfm-search-tests/tst_search_utils.cpp index 11b319e9..c962d449 100644 --- a/autotests/dfm-search-tests/tst_search_utils.cpp +++ b/autotests/dfm-search-tests/tst_search_utils.cpp @@ -9,8 +9,13 @@ #include #include +#include +#include +#include + #include #include +#include using namespace DFMSEARCH; @@ -26,6 +31,7 @@ private Q_SLOTS: void testPinyinAcronym(); void testAnythingStatus(); void testFileNameBlacklistMatcher(); + void testNGramSearchQuery(); private: void doTestPinyin(const QString &caseName, const QString &input, bool expected); @@ -331,6 +337,42 @@ void 
tst_SearchUtils::testGlobal() Q_UNUSED(blacklistPaths); } +void tst_SearchUtils::testNGramSearchQuery() +{ + Lucene::QueryPtr oneCharQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "A"); + Lucene::TermQueryPtr oneCharTermQuery = boost::dynamic_pointer_cast(oneCharQuery); + QVERIFY(oneCharTermQuery); + QCOMPARE(oneCharTermQuery->getTerm()->field(), Lucene::String(L"contents")); + QCOMPARE(oneCharTermQuery->getTerm()->text(), Lucene::String(L"a")); + + Lucene::QueryPtr twoCharQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "Ab"); + Lucene::TermQueryPtr twoCharTermQuery = boost::dynamic_pointer_cast(twoCharQuery); + QVERIFY(twoCharTermQuery); + QCOMPARE(twoCharTermQuery->getTerm()->text(), Lucene::String(L"ab")); + + Lucene::QueryPtr evenQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "abcdef"); + Lucene::PhraseQueryPtr evenPhraseQuery = boost::dynamic_pointer_cast(evenQuery); + QVERIFY(evenPhraseQuery); + QCOMPARE(evenPhraseQuery->getTerms().size(), 3); + QCOMPARE(evenPhraseQuery->getTerms()[0]->text(), Lucene::String(L"ab")); + QCOMPARE(evenPhraseQuery->getTerms()[1]->text(), Lucene::String(L"cd")); + QCOMPARE(evenPhraseQuery->getTerms()[2]->text(), Lucene::String(L"ef")); + QCOMPARE(evenPhraseQuery->getPositions()[0], 1); + QCOMPARE(evenPhraseQuery->getPositions()[1], 5); + QCOMPARE(evenPhraseQuery->getPositions()[2], 9); + + Lucene::QueryPtr oddQuery = LuceneQueryUtils::buildNGramSearchQuery("contents", "abcde"); + Lucene::PhraseQueryPtr oddPhraseQuery = boost::dynamic_pointer_cast(oddQuery); + QVERIFY(oddPhraseQuery); + QCOMPARE(oddPhraseQuery->getTerms().size(), 3); + QCOMPARE(oddPhraseQuery->getTerms()[0]->text(), Lucene::String(L"ab")); + QCOMPARE(oddPhraseQuery->getTerms()[1]->text(), Lucene::String(L"cd")); + QCOMPARE(oddPhraseQuery->getTerms()[2]->text(), Lucene::String(L"de")); + QCOMPARE(oddPhraseQuery->getPositions()[0], 1); + QCOMPARE(oddPhraseQuery->getPositions()[1], 5); + QCOMPARE(oddPhraseQuery->getPositions()[2], 
7); +} + QObject *create_tst_SearchUtils() { return new tst_SearchUtils(); diff --git a/autotests/dfm-search-tests/tst_semantic_search.cpp b/autotests/dfm-search-tests/tst_semantic_search.cpp new file mode 100644 index 00000000..f56577cc --- /dev/null +++ b/autotests/dfm-search-tests/tst_semantic_search.cpp @@ -0,0 +1,1065 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include +#include + +#include + +#include "semantic/semanticruleengine.h" +#include "semantic/intentparser.h" +#include "semantic/ruleconfigloader.h" +#include "semantic/extractors/keywordextractor.h" +#include "semantic/semanticquerybuilder.h" + +using namespace DFMSEARCH; + +static bool buildGroupFromJson(const QByteArray &json, RuleGroup &outGroup) +{ + QJsonDocument doc = QJsonDocument::fromJson(json); + if (!doc.isObject()) { + return false; + } + QJsonObject root = doc.object(); + QJsonArray groups = root.value("groups").toArray(); + if (groups.isEmpty()) { + return false; + } + return SemanticRuleEngine::parseRuleGroupStatic(groups.at(0).toObject(), outGroup); +} + +// Helper: build a simple rule group JSON string +static QByteArray makeRuleJson(const QString &groupName, const QString &ruleId, + const QString &pattern, int priority, + const QVariantMap &metadata = {}) +{ + QJsonObject ruleObj; + ruleObj["id"] = ruleId; + ruleObj["pattern"] = pattern; + ruleObj["enabled"] = true; + ruleObj["priority"] = priority; + if (!metadata.isEmpty()) { + ruleObj["metadata"] = QJsonObject::fromVariantMap(metadata); + } + + QJsonObject ruleGroupObj; + ruleGroupObj["name"] = groupName; + ruleGroupObj["version"] = "1.0.0"; + ruleGroupObj["rules"] = QJsonArray({ruleObj}); + + QJsonObject root; + root["groups"] = QJsonArray({ruleGroupObj}); + + return QJsonDocument(root).toJson(QJsonDocument::Compact); +} + +// ===== tst_RuleEngine ===== + +class tst_RuleEngine : public QObject +{ + 
Q_OBJECT + +private Q_SLOTS: + void parseValidGroup(); + void parseEmptyGroup(); + void parsePriorityOrdering(); + void matchReturnsHighestPriority(); + void matchAllReturnsAll(); + void ruleMetadataAccess(); + void hasGroupCheck(); +}; + +void tst_RuleEngine::parseValidGroup() +{ + QByteArray json = makeRuleJson("test", "r1", "hello", 100); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.name, QString("test")); + QCOMPARE(group.rules.size(), 1); + QCOMPARE(group.rules[0].id, QString("r1")); + QVERIFY(group.rules[0].regex.isValid()); +} + +void tst_RuleEngine::parseEmptyGroup() +{ + QByteArray json = R"=====({"groups": [{"name": "empty", "rules": []}]})====="; + + RuleGroup group; + QVERIFY(!buildGroupFromJson(json, group)); +} + +void tst_RuleEngine::parsePriorityOrdering() +{ + QJsonObject r1, r2, r3; + r1["id"] = "low"; r1["pattern"] = "test"; r1["priority"] = 10; + r2["id"] = "high"; r2["pattern"] = "test"; r2["priority"] = 200; + r3["id"] = "mid"; r3["pattern"] = "test"; r3["priority"] = 100; + + QJsonObject ruleGroup; + ruleGroup["name"] = "prio"; + ruleGroup["version"] = "1.0.0"; + ruleGroup["rules"] = QJsonArray({r1, r2, r3}); + + QJsonObject root; + root["groups"] = QJsonArray({ruleGroup}); + + RuleGroup group; + QVERIFY(SemanticRuleEngine::parseRuleGroupStatic(ruleGroup, group)); + QCOMPARE(group.rules.size(), 3); + + QStringList ids; + for (const Rule &r : group.rules) { + ids.append(r.id); + } + QVERIFY(ids.contains("low")); + QVERIFY(ids.contains("mid")); + QVERIFY(ids.contains("high")); +} + +void tst_RuleEngine::matchReturnsHighestPriority() +{ + QByteArray json = makeRuleJson("test_match", "r1", "abc", 200, + {{"level", "high"}}); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("abc").hasMatch()); +} + +void tst_RuleEngine::matchAllReturnsAll() +{ + QJsonObject r1, r2, r3; + r1["id"] = "r1"; r1["pattern"] = "cat"; r1["priority"] = 100; + r2["id"] = "r2"; 
r2["pattern"] = "dog"; r2["priority"] = 100; + r3["id"] = "r3"; r3["pattern"] = "bird"; r3["priority"] = 50; + + QJsonObject ruleGroup; + ruleGroup["name"] = "test_all"; + ruleGroup["version"] = "1.0.0"; + ruleGroup["rules"] = QJsonArray({r1, r2, r3}); + + QJsonObject root; + root["groups"] = QJsonArray({ruleGroup}); + + RuleGroup group; + QVERIFY(SemanticRuleEngine::parseRuleGroupStatic(ruleGroup, group)); + QCOMPARE(group.rules.size(), 3); +} + +void tst_RuleEngine::ruleMetadataAccess() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "today"; + QByteArray json = makeRuleJson("test_meta", "m1", "test", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("type").toString(), QString("preset")); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("today")); +} + +void tst_RuleEngine::hasGroupCheck() +{ + SemanticRuleEngine engine; + QVERIFY(!engine.hasGroup("time")); + QVERIFY(!engine.hasGroup("filetype")); + QCOMPARE(engine.groupNames().size(), 0); +} + +// ===== tst_TimeExtraction ===== + +class tst_TimeExtraction : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void presetToday(); + void presetYesterday(); + void presetThisWeek(); + void presetThisMonth(); + void presetThisYear(); + void presetLastYear(); + void customYear(); + void customYearMonth(); + void customFullDate(); + void noMatch(); +}; + +void tst_TimeExtraction::presetToday() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "today"; + QByteArray json = makeRuleJson("time", "time_today", "today", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("today")); + QVERIFY(group.rules[0].regex.isValid()); + QVERIFY(group.rules[0].regex.match("today").hasMatch()); + QVERIFY(!group.rules[0].regex.match("yesterday").hasMatch()); +} + +void tst_TimeExtraction::presetYesterday() +{ + 
QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "yesterday"; + QByteArray json = makeRuleJson("time", "time_yesterday", "yesterday", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("yesterday")); + QVERIFY(group.rules[0].regex.match("yesterday").hasMatch()); +} + +void tst_TimeExtraction::presetThisWeek() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "this_week"; + QByteArray json = makeRuleJson("time", "time_this_week", "this_week", 190, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("this_week")); +} + +void tst_TimeExtraction::presetThisMonth() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "this_month"; + QByteArray json = makeRuleJson("time", "time_this_month", "this_month", 180, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("this_month")); +} + +void tst_TimeExtraction::presetThisYear() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "this_year"; + QByteArray json = makeRuleJson("time", "time_this_year", "this_year", 170, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("this_year")); +} + +void tst_TimeExtraction::presetLastYear() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "last_year"; + QByteArray json = makeRuleJson("time", "time_last_year", "last_year", 170, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("preset").toString(), QString("last_year")); +} + +void tst_TimeExtraction::customYear() +{ + // Use programmatic JSON to avoid raw string delimiter conflict with regex patterns + QVariantMap meta; + 
meta["type"] = "custom"; + meta["format"] = "year"; + QByteArray json = makeRuleJson("time", "time_exact_year", + "(?\\d{2,4})year", 160, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("2025year"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured("year"), QString("2025")); +} + +void tst_TimeExtraction::customYearMonth() +{ + QVariantMap meta; + meta["type"] = "custom"; + meta["format"] = "year_month"; + QByteArray json = makeRuleJson("time", "time_exact_year_month", + "(?\\d{4})-(?\\d{1,2})", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("2025-12"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured("year"), QString("2025")); + QCOMPARE(match.captured("month"), QString("12")); +} + +void tst_TimeExtraction::customFullDate() +{ + QVariantMap meta; + meta["type"] = "custom"; + meta["format"] = "full_date"; + QByteArray json = makeRuleJson("time", "time_exact_full_date", + "(?\\d{4})-(?\\d{1,2})-(?\\d{1,2})", + 140, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("2025-03-15"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured("year"), QString("2025")); + QCOMPARE(match.captured("month"), QString("03")); + QCOMPARE(match.captured("day"), QString("15")); +} + +void tst_TimeExtraction::noMatch() +{ + QVariantMap meta; + meta["type"] = "preset"; + meta["preset"] = "today"; + QByteArray json = makeRuleJson("time", "time_today", "today", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(!group.rules[0].regex.match("random text without match").hasMatch()); +} + +// ===== tst_FileTypeExtraction ===== + +class tst_FileTypeExtraction : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void precisePdf(); + void preciseWord(); + void preciseExcel(); + void precisePpt(); + void imageType(); + void videoType(); + void 
audioType(); + void genericDocument(); + void genericSpreadsheet(); + void genericPresentation(); + void archiveType(); + void extensionsList(); + void generalFlag(); +}; + +void tst_FileTypeExtraction::precisePdf() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"pdf"}); + QByteArray json = makeRuleJson("filetype", "filetype_pdf", "pdf", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("pdf").hasMatch()); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList(), QStringList({"pdf"})); +} + +void tst_FileTypeExtraction::preciseWord() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"doc", "docx"}); + meta["fileTypes"] = QStringList({"doc"}); + QByteArray json = makeRuleJson("filetype", "filetype_word", "word|doc|docx", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("word").hasMatch()); + QVERIFY(group.rules[0].regex.match("docx").hasMatch()); + QVERIFY(!group.rules[0].regex.match("pdf").hasMatch()); +} + +void tst_FileTypeExtraction::preciseExcel() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"xls", "xlsx"}); + QByteArray json = makeRuleJson("filetype", "filetype_excel", "excel|xls|xlsx", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("excel").hasMatch()); + QVERIFY(group.rules[0].regex.match("xlsx").hasMatch()); +} + +void tst_FileTypeExtraction::precisePpt() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"ppt", "pptx"}); + QByteArray json = makeRuleJson("filetype", "filetype_ppt", "ppt|pptx", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].regex.match("ppt").hasMatch()); + QVERIFY(group.rules[0].regex.match("pptx").hasMatch()); +} + +void tst_FileTypeExtraction::imageType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"jpg", "png", "gif"}); + 
meta["fileTypes"] = QStringList({"pic"}); + QByteArray json = makeRuleJson("filetype", "filetype_image", "image", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("fileTypes").toStringList(), QStringList({"pic"})); +} + +void tst_FileTypeExtraction::videoType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"mp4", "avi", "mkv"}); + meta["fileTypes"] = QStringList({"video"}); + QByteArray json = makeRuleJson("filetype", "filetype_video", "video", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("fileTypes").toStringList(), QStringList({"video"})); +} + +void tst_FileTypeExtraction::audioType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"mp3", "wav", "flac"}); + meta["fileTypes"] = QStringList({"audio"}); + QByteArray json = makeRuleJson("filetype", "filetype_audio", "audio", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("fileTypes").toStringList(), QStringList({"audio"})); +} + +void tst_FileTypeExtraction::genericDocument() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"doc", "docx", "pdf", "txt"}); + meta["fileTypes"] = QStringList({"doc"}); + meta["general"] = true; + QByteArray json = makeRuleJson("filetype", "filetype_document_general", "document", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("general").toBool()); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList().size(), 4); +} + +void tst_FileTypeExtraction::genericSpreadsheet() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"xls", "xlsx", "csv"}); + meta["general"] = true; + QByteArray json = makeRuleJson("filetype", "filetype_spreadsheet_general", "spreadsheet", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + 
QVERIFY(group.rules[0].metadata.value("general").toBool()); +} + +void tst_FileTypeExtraction::genericPresentation() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"ppt", "pptx", "dps"}); + meta["general"] = true; + QByteArray json = makeRuleJson("filetype", "filetype_presentation_general", "presentation", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("general").toBool()); +} + +void tst_FileTypeExtraction::archiveType() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"zip", "tar", "rar", "7z"}); + QByteArray json = makeRuleJson("filetype", "filetype_archive", "archive", 150, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList().size(), 4); +} + +void tst_FileTypeExtraction::extensionsList() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"a", "b", "c", "d", "e"}); + QByteArray json = makeRuleJson("filetype", "ft", "test", 100, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("extensions").toStringList().size(), 5); +} + +void tst_FileTypeExtraction::generalFlag() +{ + QVariantMap meta; + meta["extensions"] = QStringList({"pdf"}); + QByteArray json = makeRuleJson("filetype", "ft_precise", "pdf", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(!group.rules[0].metadata.value("general").toBool()); +} + +// ===== tst_KeywordExtraction ===== + +class tst_KeywordExtraction : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void containsPattern(); + void namedPattern(); + void contentHasPattern(); + void noMatch(); + void captureGroup(); + void multiKeywordFlag(); +}; + +void tst_KeywordExtraction::containsPattern() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = true; + QByteArray json = makeRuleJson("keyword", "keyword_contains", + 
"contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("contains meeting notes"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured(1), QString(" meeting notes")); +} + +void tst_KeywordExtraction::namedPattern() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = false; + QByteArray json = makeRuleJson("keyword", "keyword_named", + "named (.+?)(?: of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("named report of"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured(1), QString("report")); +} + +void tst_KeywordExtraction::contentHasPattern() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = true; + QByteArray json = makeRuleJson("keyword", "keyword_content_has", + "content(?: has| contains| includes)(.+?)(?: of|$)", + 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + + auto match = group.rules[0].regex.match("content includes budget data"); + QVERIFY(match.hasMatch()); + QCOMPARE(match.captured(1), QString(" budget data")); +} + +void tst_KeywordExtraction::noMatch() +{ + QVariantMap meta; + meta["capture_group"] = 1; + QByteArray json = makeRuleJson("keyword", "keyword_contains", + "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(!group.rules[0].regex.match("no keyword pattern here").hasMatch()); +} + +void tst_KeywordExtraction::captureGroup() +{ + QVariantMap meta; + meta["capture_group"] = 1; + QByteArray json = makeRuleJson("keyword", "k1", "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QCOMPARE(group.rules[0].metadata.value("capture_group").toInt(), 1); +} + +void tst_KeywordExtraction::multiKeywordFlag() +{ + QVariantMap meta; + meta["capture_group"] = 1; + meta["multi_keyword"] = 
true; + QByteArray json = makeRuleJson("keyword", "k1", "contains(.+?)(?:of|$)", 200, meta); + + RuleGroup group; + QVERIFY(buildGroupFromJson(json, group)); + QVERIFY(group.rules[0].metadata.value("multi_keyword").toBool()); +} + +// ===== tst_ParsedIntent ===== + +class tst_ParsedIntent : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void defaultState(); + void timeConstraintDefault(); + void timeConstraintPreset(); + void matchSpanValidity(); +}; + +void tst_ParsedIntent::defaultState() +{ + ParsedIntent intent; + QVERIFY(intent.timeConstraint.kind == TimeConstraintKind::None); + QVERIFY(intent.fileExtensions.isEmpty()); + QVERIFY(intent.keywords.isEmpty()); + QVERIFY(intent.consumedSpans.isEmpty()); +} + +void tst_ParsedIntent::timeConstraintDefault() +{ + TimeConstraint tc; + QVERIFY(!tc.isValid()); + QCOMPARE(tc.kind, TimeConstraintKind::None); +} + +void tst_ParsedIntent::timeConstraintPreset() +{ + TimeConstraint tc; + tc.kind = TimeConstraintKind::Preset; + tc.preset = TimePreset::Today; + QVERIFY(tc.isValid()); +} + +void tst_ParsedIntent::matchSpanValidity() +{ + MatchSpan span; + QVERIFY(!span.isValid()); + + span.start = 0; + span.end = 5; + span.ruleId = "test_rule"; + QVERIFY(span.isValid()); +} + +// ===== tst_IsSemanticQuery ===== + +namespace { + +// Resolve the source tree rule directory relative to TEST_SOURCE_DIR. +// Falls back to a path three levels above the test binary's directory when +// TEST_SOURCE_DIR is an empty string. +QString sourceRulesDir() +{ + QString base = QString::fromUtf8(TEST_SOURCE_DIR); + if (base.isEmpty()) { + base = QCoreApplication::applicationDirPath() + "/../../.."; + } + return base + "/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN"; +} + +// Check whether the source tree rule directory exists and contains at least one *.json file +// (the files are only listed here, not loaded or parsed). 
+bool sourceRulesAvailable() +{ + const QString dir = sourceRulesDir(); + return QDir(dir).exists() + && !QDir(dir).entryList({"*.json"}, QDir::Files).isEmpty(); +} + +// Replicate isSemanticQuery() logic using internal components with source-tree rules. +// Returns false for blank input; otherwise true when the parsed intent carries a valid +// time or size constraint, at least one file extension, or at least one search directory. +bool checkIsSemanticQuery(SemanticRuleEngine *engine, IntentParser *parser, + const QString &input) +{ + // The engine is not consulted here; only the parser's output decides the result. + Q_UNUSED(engine); + + if (input.trimmed().isEmpty()) { + return false; + } + + ParsedIntent intent; + parser->parse(input, intent); + + return intent.timeConstraint.isValid() + || intent.sizeConstraint.isValid() + || !intent.fileExtensions.isEmpty() + || !intent.searchDirectories.isEmpty(); +} + +} // namespace + +class tst_IsSemanticQuery : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void initTestCase(); + void emptyInput(); + void whitespaceOnly(); + void plainKeyword(); + void plainChineseKeyword(); + void todayKeyword(); + void yesterdayKeyword(); + void thisWeekKeyword(); + void lastMonthKeyword(); + void fileTypePdf(); + void fileTypeImage(); + void fileTypeDocument(); + void locationDesktop(); + void locationDownloads(); + void locationTrash(); + void sizeLarge(); + void sizeSmall(); + void sizeDynamic(); + void timeAndFileType(); + void locationAndTime(); + void keywordOnlyNoMatch(); + void consecutiveCalls(); + void noiseWordsOnly(); + +private: + SemanticRuleEngine *m_engine = nullptr; + IntentParser *m_parser = nullptr; +}; + +// Builds the engine from every *.json rule file in the source tree (sorted by name) and +// creates the parser on top of it. Calls QSKIP when the source-tree rules are unavailable. +// A rule file that fails to load is reported with qWarning() and otherwise ignored. +void tst_IsSemanticQuery::initTestCase() +{ + if (!sourceRulesAvailable()) { + QSKIP("Rule files not found in source tree, skipping isSemanticQuery tests"); + } + + m_engine = new SemanticRuleEngine(this); + const QString dir = sourceRulesDir(); + const QStringList ruleFiles = QDir(dir).entryList( + {"*.json"}, QDir::Files, QDir::Name); + for (const QString &filename : ruleFiles) { + QString path = dir + "/" + filename; + if (!m_engine->loadRuleFile(path)) { + qWarning() << "Failed to load rule file:" << path; + } + } + + // NOTE(review): m_parser is never deleted in this class; confirm whether IntentParser + // is owned by the engine passed to it, otherwise it leaks at test exit. + m_parser = new IntentParser(m_engine); +} + +void 
tst_IsSemanticQuery::emptyInput() +{ + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, QString())); +} + +void tst_IsSemanticQuery::whitespaceOnly() +{ + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, " ")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "\t\n")); +} + +void tst_IsSemanticQuery::plainKeyword() +{ + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "hello")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "meeting notes")); +} + +void tst_IsSemanticQuery::plainChineseKeyword() +{ + // Pure Chinese text without any semantic triggers. + // Avoid words that match filetype/location/time/size rules + // (e.g. "报告" matches filetype_document_general, "音乐" matches filetype_audio). + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "蓝天白云")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "春夏秋冬")); +} + +void tst_IsSemanticQuery::todayKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "今天的文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "今日份报告")); +} + +void tst_IsSemanticQuery::yesterdayKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "昨天的报告")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "昨晚的截图")); +} + +void tst_IsSemanticQuery::thisWeekKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "本周的文档")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "这周修改的")); +} + +void tst_IsSemanticQuery::lastMonthKeyword() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "上个月的文件")); +} + +void tst_IsSemanticQuery::fileTypePdf() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "pdf文档")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "找一下pdf")); +} + +void tst_IsSemanticQuery::fileTypeImage() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "图片")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "截图")); +} + +void tst_IsSemanticQuery::fileTypeDocument() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "文档")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "报告")); 
+} + +void tst_IsSemanticQuery::locationDesktop() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "桌面的文件")); +} + +void tst_IsSemanticQuery::locationDownloads() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "下载的文件")); +} + +void tst_IsSemanticQuery::locationTrash() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "回收站的文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "删除的文件")); +} + +void tst_IsSemanticQuery::sizeLarge() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "大文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "几个G的文件")); +} + +void tst_IsSemanticQuery::sizeSmall() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "小文件")); +} + +void tst_IsSemanticQuery::sizeDynamic() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "大于500M的文件")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "小于100K")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "10M以上的表格")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "1G以内的文档")); +} + +void tst_IsSemanticQuery::timeAndFileType() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "今天的pdf")); + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "本周的图片")); +} + +void tst_IsSemanticQuery::locationAndTime() +{ + QVERIFY(checkIsSemanticQuery(m_engine, m_parser, "桌面今天的文件")); +} + +void tst_IsSemanticQuery::keywordOnlyNoMatch() +{ + // Text that does not match any semantic rule pattern + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "xyzabc123")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "随便什么文字")); +} + +void tst_IsSemanticQuery::consecutiveCalls() +{ + // Multiple calls with the same input should return consistent results + QString input = "今天的pdf"; + bool first = checkIsSemanticQuery(m_engine, m_parser, input); + bool second = checkIsSemanticQuery(m_engine, m_parser, input); + bool third = checkIsSemanticQuery(m_engine, m_parser, input); + QCOMPARE(first, second); + QCOMPARE(second, third); + QVERIFY(first); + + QString plain = "hello world"; + bool p1 = 
checkIsSemanticQuery(m_engine, m_parser, plain); + bool p2 = checkIsSemanticQuery(m_engine, m_parser, plain); + QCOMPARE(p1, p2); + QVERIFY(!p1); +} + +void tst_IsSemanticQuery::noiseWordsOnly() +{ + // Noise words alone (search action words) without any semantic dimension + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "搜索")); + QVERIFY(!checkIsSemanticQuery(m_engine, m_parser, "查找")); +} + +// ===== tst_SearchTarget ===== + +class tst_SearchTarget : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void initTestCase(); + void defaultIsAll(); + void filenameContains(); + void filenameNamed(); + void contentContains(); + void genericContainsStaysAll(); + void unconsumedTextStaysAll(); + +private: + SemanticRuleEngine *m_engine = nullptr; + KeywordExtractor *m_extractor = nullptr; +}; + +void tst_SearchTarget::initTestCase() +{ + if (!sourceRulesAvailable()) { + QSKIP("Rule files not found in source tree, skipping search target tests"); + } + + m_engine = new SemanticRuleEngine(this); + const QString dir = sourceRulesDir(); + const QStringList ruleFiles = QDir(dir).entryList( + {"*.json"}, QDir::Files, QDir::Name); + for (const QString &filename : ruleFiles) { + QString path = dir + "/" + filename; + if (!m_engine->loadRuleFile(path)) { + qWarning() << "Failed to load rule file:" << path; + } + } + + m_extractor = new KeywordExtractor(m_engine); +} + +void tst_SearchTarget::defaultIsAll() +{ + ParsedIntent intent; + m_extractor->extract("蓝天白云", intent); + QCOMPARE(intent.searchTarget, SearchTarget::All); +} + +void tst_SearchTarget::filenameContains() +{ + ParsedIntent intent; + m_extractor->extract("文件名包含测试的文档", intent); + QCOMPARE(intent.searchTarget, SearchTarget::FileNameOnly); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("测试")); +} + +void tst_SearchTarget::filenameNamed() +{ + ParsedIntent intent; + m_extractor->extract("名为报告的文件", intent); + QCOMPARE(intent.searchTarget, SearchTarget::FileNameOnly); + 
QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("报告")); +} + +void tst_SearchTarget::contentContains() +{ + ParsedIntent intent; + m_extractor->extract("文件内容包含配置的文档", intent); + QCOMPARE(intent.searchTarget, SearchTarget::ContentOnly); + QCOMPARE(intent.keywords.size(), 1); + QCOMPARE(intent.keywords.first(), QString("配置")); +} + +void tst_SearchTarget::genericContainsStaysAll() +{ + ParsedIntent intent; + m_extractor->extract("包含测试的文件", intent); + QCOMPARE(intent.searchTarget, SearchTarget::All); + QVERIFY(!intent.keywords.isEmpty()); +} + +void tst_SearchTarget::unconsumedTextStaysAll() +{ + // No structured keyword rule matches → unconsumed text extraction + ParsedIntent intent; + m_extractor->extract("项目计划书", intent); + QCOMPARE(intent.searchTarget, SearchTarget::All); + QVERIFY(!intent.keywords.isEmpty()); +} + +// ===== tst_SemanticQueryBuilderTarget ===== + +class tst_SemanticQueryBuilderTarget : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void defaultTarget(); + void fileNameOnlyTarget(); + void contentOnlyTarget(); + +private: + ParsedIntent makeIntent(const QStringList &keywords, SearchTarget target) const; +}; + +ParsedIntent tst_SemanticQueryBuilderTarget::makeIntent( + const QStringList &keywords, SearchTarget target) const +{ + ParsedIntent intent; + intent.keywords = keywords; + intent.searchTarget = target; + return intent; +} + +void tst_SemanticQueryBuilderTarget::defaultTarget() +{ + SemanticQueryBuilder builder; + ParsedIntent intent = makeIntent({"测试"}, SearchTarget::All); + SemanticSearchPlan plan = builder.build(intent); + + // All three paths should produce queries + QVERIFY(!plan.fileNameQuery.keyword().isEmpty()); + QVERIFY(plan.contentQuery.has_value()); + QVERIFY(plan.ocrQuery.has_value()); +} + +void tst_SemanticQueryBuilderTarget::fileNameOnlyTarget() +{ + SemanticQueryBuilder builder; + ParsedIntent intent = makeIntent({"测试"}, SearchTarget::FileNameOnly); + SemanticSearchPlan plan = 
builder.build(intent); + + // Only filename query should be built + QVERIFY(!plan.fileNameQuery.keyword().isEmpty()); + QVERIFY(!plan.contentQuery.has_value()); + QVERIFY(!plan.ocrQuery.has_value()); +} + +void tst_SemanticQueryBuilderTarget::contentOnlyTarget() +{ + SemanticQueryBuilder builder; + ParsedIntent intent = makeIntent({"测试"}, SearchTarget::ContentOnly); + SemanticSearchPlan plan = builder.build(intent); + + // Filename should NOT be built; content and ocr should + QVERIFY(plan.fileNameQuery.keyword().isEmpty()); + QVERIFY(plan.contentQuery.has_value()); + QVERIFY(plan.ocrQuery.has_value()); +} + +// ===== Factory functions ===== + +QObject *create_tst_RuleEngine() { return new tst_RuleEngine(); } +QObject *create_tst_TimeExtraction() { return new tst_TimeExtraction(); } +QObject *create_tst_FileTypeExtraction() { return new tst_FileTypeExtraction(); } +QObject *create_tst_KeywordExtraction() { return new tst_KeywordExtraction(); } +QObject *create_tst_ParsedIntent() { return new tst_ParsedIntent(); } +QObject *create_tst_IsSemanticQuery() { return new tst_IsSemanticQuery(); } +QObject *create_tst_SearchTarget() { return new tst_SearchTarget(); } +QObject *create_tst_SemanticQueryBuilderTarget() { return new tst_SemanticQueryBuilderTarget(); } + +#include "tst_semantic_search.moc" diff --git a/autotests/dfm-search-tests/tst_size_range_filter.cpp b/autotests/dfm-search-tests/tst_size_range_filter.cpp new file mode 100644 index 00000000..ca361e22 --- /dev/null +++ b/autotests/dfm-search-tests/tst_size_range_filter.cpp @@ -0,0 +1,374 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include +#include +#include + +#include "size_parser.h" + +using namespace DFMSEARCH; + +class tst_SizeRangeFilter : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + // SizeRangeFilter tests + void testDefaultState(); + void testSetMin(); + void testSetMax(); + void testSetRange(); + void testFluentChaining(); + void testBoundaryControl(); + void testCopyConstructor(); + void testMoveConstructor(); + void testClear(); + void testIsValid(); + + // SizeParser tests + void testParseSizeBytes(); + void testParseSizeKilobytes(); + void testParseSizeMegabytes(); + void testParseSizeGigabytes(); + void testParseSizeTerabytes(); + void testParseSizeCaseInsensitive(); + void testParseSizeInvalid(); + void testParseSizeEmpty(); + void testParseSizeWithSpaces(); + + // SearchOptions integration + void testSearchOptionsSizeFilter(); + void testSearchOptionsClearSizeFilter(); + + // FileNameResultAPI integration + void testFileNameResultAPIFileSizeBytes(); +}; + +// ==================== SizeRangeFilter Tests ==================== + +void tst_SizeRangeFilter::testDefaultState() +{ + SizeRangeFilter filter; + QCOMPARE(filter.minSize(), 0); + QCOMPARE(filter.maxSize(), 0); + QCOMPARE(filter.includeLower(), true); + QCOMPARE(filter.includeUpper(), true); + QCOMPARE(filter.isValid(), false); +} + +void tst_SizeRangeFilter::testSetMin() +{ + SizeRangeFilter filter; + filter.setMin(1024); + + QCOMPARE(filter.minSize(), 1024); + QCOMPARE(filter.isValid(), true); +} + +void tst_SizeRangeFilter::testSetMax() +{ + SizeRangeFilter filter; + filter.setMax(10 * 1024 * 1024); + + QCOMPARE(filter.maxSize(), 10 * 1024 * 1024); + QCOMPARE(filter.isValid(), true); +} + +void tst_SizeRangeFilter::testSetRange() +{ + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + + QCOMPARE(filter.minSize(), 1024); + QCOMPARE(filter.maxSize(), 10 * 1024 * 1024); + QCOMPARE(filter.isValid(), true); +} + +void 
tst_SizeRangeFilter::testFluentChaining() +{ + SizeRangeFilter filter; + auto &ref = filter.setMin(1024).setMax(10 * 1024 * 1024); + + QCOMPARE(filter.minSize(), 1024); + QCOMPARE(filter.maxSize(), 10 * 1024 * 1024); + QCOMPARE(&ref, &filter); // 返回自身的引用 +} + +void tst_SizeRangeFilter::testBoundaryControl() +{ + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + filter.setIncludeLower(false); + filter.setIncludeUpper(false); + + QCOMPARE(filter.includeLower(), false); + QCOMPARE(filter.includeUpper(), false); +} + +void tst_SizeRangeFilter::testCopyConstructor() +{ + SizeRangeFilter original; + original.setRange(1024, 10 * 1024 * 1024); + original.setIncludeLower(false); + + SizeRangeFilter copy(original); + QCOMPARE(copy.minSize(), 1024); + QCOMPARE(copy.maxSize(), 10 * 1024 * 1024); + QCOMPARE(copy.includeLower(), false); + QCOMPARE(copy.includeUpper(), true); + QCOMPARE(copy.isValid(), true); +} + +void tst_SizeRangeFilter::testMoveConstructor() +{ + SizeRangeFilter original; + original.setRange(1024, 10 * 1024 * 1024); + + SizeRangeFilter moved(std::move(original)); + QCOMPARE(moved.minSize(), 1024); + QCOMPARE(moved.maxSize(), 10 * 1024 * 1024); + QCOMPARE(moved.isValid(), true); +} + +void tst_SizeRangeFilter::testClear() +{ + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + filter.setIncludeLower(false); + filter.setIncludeUpper(false); + + filter.clear(); + + QCOMPARE(filter.minSize(), 0); + QCOMPARE(filter.maxSize(), 0); + QCOMPARE(filter.includeLower(), true); // 重置为默认值 + QCOMPARE(filter.includeUpper(), true); + QCOMPARE(filter.isValid(), false); +} + +void tst_SizeRangeFilter::testIsValid() +{ + SizeRangeFilter filter; + + // 默认状态无效 + QCOMPARE(filter.isValid(), false); + + // 设置 min 后有效 + filter.setMin(1); + QCOMPARE(filter.isValid(), true); + filter.clear(); + + // 设置 max 后有效 + filter.setMax(1); + QCOMPARE(filter.isValid(), true); + filter.clear(); + + // 设置 0 值仍无效 + filter.setMin(0); + QCOMPARE(filter.isValid(), 
false); + filter.setMax(0); + QCOMPARE(filter.isValid(), false); +} + +// ==================== SizeParser Tests ==================== + +void tst_SizeRangeFilter::testParseSizeBytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("512", bytes)); + QCOMPARE(bytes, 512); + + QVERIFY(dfmsearch::SizeParser::parseSize("0", bytes)); + QCOMPARE(bytes, 0); + + QVERIFY(dfmsearch::SizeParser::parseSize("1024", bytes)); + QCOMPARE(bytes, 1024); +} + +void tst_SizeRangeFilter::testParseSizeKilobytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1K", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1KB", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("10K", bytes)); + QCOMPARE(bytes, 10240); + + QVERIFY(dfmsearch::SizeParser::parseSize("1.5K", bytes)); + QCOMPARE(bytes, 1536); +} + +// Expects the M and MB suffixes to multiply by 1024 * 1024 and fractional values to be accepted. +void tst_SizeRangeFilter::testParseSizeMegabytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1M", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1MB", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("10M", bytes)); + QCOMPARE(bytes, 10LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1.5M", bytes)); + // Convert the floating-point product to qint64 so it is compared as an integer byte count. + QCOMPARE(bytes, static_cast<qint64>(1.5 * 1024 * 1024)); +} + +void tst_SizeRangeFilter::testParseSizeGigabytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1G", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1GB", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("10G", bytes)); + QCOMPARE(bytes, 10LL * 1024 * 1024 * 1024); +} + +void tst_SizeRangeFilter::testParseSizeTerabytes() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1T", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1TB", 
bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024 * 1024); +} + +void tst_SizeRangeFilter::testParseSizeCaseInsensitive() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize("1k", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1m", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1g", bytes)); + QCOMPARE(bytes, 1024LL * 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1kb", bytes)); + QCOMPARE(bytes, 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize("1mb", bytes)); + QCOMPARE(bytes, 1024 * 1024); +} + +void tst_SizeRangeFilter::testParseSizeInvalid() +{ + qint64 bytes = 0; + + // 未知后缀 + QVERIFY(!dfmsearch::SizeParser::parseSize("1X", bytes)); + + // 纯字母 + QVERIFY(!dfmsearch::SizeParser::parseSize("abc", bytes)); + + // 负数 + QVERIFY(!dfmsearch::SizeParser::parseSize("-1K", bytes)); + + // 空数字 + QVERIFY(!dfmsearch::SizeParser::parseSize("K", bytes)); +} + +void tst_SizeRangeFilter::testParseSizeEmpty() +{ + qint64 bytes = -1; + + QVERIFY(!dfmsearch::SizeParser::parseSize("", bytes)); + QCOMPARE(bytes, -1); // 不应被修改 +} + +void tst_SizeRangeFilter::testParseSizeWithSpaces() +{ + qint64 bytes = 0; + + QVERIFY(dfmsearch::SizeParser::parseSize(" 1M ", bytes)); + QCOMPARE(bytes, 1024 * 1024); + + QVERIFY(dfmsearch::SizeParser::parseSize(" 512 ", bytes)); + QCOMPARE(bytes, 512); +} + +// ==================== SearchOptions Integration Tests ==================== + +void tst_SizeRangeFilter::testSearchOptionsSizeFilter() +{ + SearchOptions options; + + // 默认无大小过滤 + QCOMPARE(options.hasSizeRangeFilter(), false); + + // 设置大小过滤 + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + options.setSizeRangeFilter(filter); + + QCOMPARE(options.hasSizeRangeFilter(), true); + + SizeRangeFilter retrieved = options.sizeRangeFilter(); + QCOMPARE(retrieved.minSize(), 1024); + QCOMPARE(retrieved.maxSize(), 10 * 1024 * 1024); +} + +void 
tst_SizeRangeFilter::testSearchOptionsClearSizeFilter() +{ + SearchOptions options; + + SizeRangeFilter filter; + filter.setRange(1024, 10 * 1024 * 1024); + options.setSizeRangeFilter(filter); + QCOMPARE(options.hasSizeRangeFilter(), true); + + options.clearSizeRangeFilter(); + QCOMPARE(options.hasSizeRangeFilter(), false); + + SizeRangeFilter retrieved = options.sizeRangeFilter(); + QCOMPARE(retrieved.minSize(), 0); + QCOMPARE(retrieved.maxSize(), 0); +} + +// ==================== FileNameResultAPI Integration Tests ==================== + +void tst_SizeRangeFilter::testFileNameResultAPIFileSizeBytes() +{ + SearchResult result("/home/user/test.txt"); + FileNameResultAPI api(result); + + // 默认值 + QCOMPARE(api.fileSizeBytes(), 0); + + // 设置和获取 + api.setFileSizeBytes(1024); + QCOMPARE(api.fileSizeBytes(), 1024); + + // 设置大文件 + api.setFileSizeBytes(10LL * 1024 * 1024 * 1024); + QCOMPARE(api.fileSizeBytes(), 10LL * 1024 * 1024 * 1024); + + // 设置 0 + api.setFileSizeBytes(0); + QCOMPARE(api.fileSizeBytes(), 0); +} + +QObject *create_tst_SizeRangeFilter() +{ + return new tst_SizeRangeFilter(); +} + +#include "tst_size_range_filter.moc" diff --git a/debian/changelog b/debian/changelog index 2712ec58..64e0be13 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,45 @@ +util-dfm (1.3.57) unstable; urgency=medium + + * fix: change search worker connection to direct + * feat: add filename search to content and OCR search + * feat: add natural language semantic search + * feat: add relative time support for Chinese search + * feat: implement file size range filtering + * feat: add file size constraint support in semantic search + * feat: add action-based time field search support + * fix: automatically handle hidden path search conditions + * feat: add location-based search support for Chinese NLP + * feat: add semantic query detection and multi-path search support + * feat: add file size range filter to search strategies + * fix: unify dfm-search library and path 
names + * feat: add file metadata attributes to search results + * fix: improve Chinese NLP search functionality + * feat: add semantic search with detailed results + * feat: enhance semantic search with explicit directories + * feat: add max results limit for semantic search + * test: add search target control tests + * feat: add chinese NLP parsing for relative time and size constraints + * feat: add NGram analyzer and tokenizer for Lucene++ + * fix: improve content search engine validation and analyzer + * refactor: optimize search filtering and query building + * feat: add on-demand content highlight retrieval + * refactor: improve NGramTokenizer and search factory + * refactor: improve OCR text search validation and analyzer selection + * perf: optimize search performance with field selector + * refactor: disable unit tests in release builds + * feat: optimize ngram search query building + * refactor: remove NGram analyzer and tokenizer components + * fix: adjust N-gram token position calculation + * feat: enhance ContentRetriever with content fetching capabilities + * test: add test utility libraries for content search + * perf: optimize OCR text search document loading + * perf: replace chinese analyzer with ngram search + * test: add filename search engine test cases + * docs: update license files and cleanup + * refactor(mount): use memfd instead of pipe for password transfer + + -- Zhang Sheng Thu, 28 May 2026 14:03:12 +0800 + util-dfm (1.3.56) unstable; urgency=medium * perf: cache resolved indexed directories to avoid repeated resolution diff --git a/debian/libdfm-search.install b/debian/libdfm-search.install index d0fd2344..01f76db8 100644 --- a/debian/libdfm-search.install +++ b/debian/libdfm-search.install @@ -1,2 +1,3 @@ usr/lib/*/libdfm-search*.so* -usr/bin/dfm-searcher \ No newline at end of file +usr/bin/dfm-searcher +usr/share/deepin/dfm-search/semantic/rules/* \ No newline at end of file diff --git a/debian/libdfm6-search.install 
b/debian/libdfm6-search.install index 9c76e128..c488f7e4 100644 --- a/debian/libdfm6-search.install +++ b/debian/libdfm6-search.install @@ -1,2 +1,3 @@ usr/lib/*/libdfm6-search*.so* -usr/bin/dfm-searcher \ No newline at end of file +usr/bin/dfm-searcher +usr/share/deepin/dfm-search/semantic/rules/* \ No newline at end of file diff --git a/include/dfm-search/dfm-search/contentretriever.h b/include/dfm-search/dfm-search/contentretriever.h new file mode 100644 index 00000000..673f38e6 --- /dev/null +++ b/include/dfm-search/dfm-search/contentretriever.h @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef CONTENTRETRIEVER_H +#define CONTENTRETRIEVER_H + +#include +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +/** + * @brief Lightweight options for highlight extraction + */ +struct HighlightOptions +{ + int maxPreviewLength = 200; ///< Maximum snippet length in characters + bool enableHtml = false; ///< Wrap matched keywords with tags +}; + +/** + * @brief Retrieves highlighted content from Lucene index on demand + * + * Provides a standalone mechanism to fetch highlighted content snippets + * for specific file paths without running a full search pipeline. + * + * Typical usage pattern: + * 1. Perform a search with isFullTextRetrievalEnabled() = false (fast) + * 2. Display path-only results to the user immediately + * 3. On demand (e.g., scroll into view), call fetchHighlight() per path + * + * This decouples highlight extraction from the search pipeline, + * enabling lazy-loading similar to thumbnail fetching. + */ +class ContentRetriever : public QObject +{ + Q_OBJECT + +public: + explicit ContentRetriever(QObject *parent = nullptr); + ~ContentRetriever() override; + + /** + * @brief Override the Lucene index directory for a given text search type. 
+ * + * When @p indexDirectory is empty, the default global index directory for + * the given type will be used. This is primarily useful for tests or + * isolated business scenarios that need to point at a temporary index. + */ + void setIndexDirectory(SearchType type, const QString &indexDirectory); + + /** + * @brief Return the effective index directory for the given text search type. + */ + QString indexDirectory(SearchType type) const; + + /** + * @brief Synchronously fetch highlighted content for a single file + * + * Opens the Lucene index, locates the document by path, + * extracts stored text, and runs ContentHighlighter to produce + * a highlighted snippet. + * + * @param path Absolute file path + * @param keyword Search keyword (supports comma-separated for multi-keyword) + * @param type SearchType::Content or SearchType::Ocr + * @param options Highlight configuration (preview length, HTML toggle) + * @return Highlighted snippet, or empty string if not found + */ + QString fetchHighlight(const QString &path, + const QString &keyword, + SearchType type, + const HighlightOptions &options = {}) const; + + /** + * @brief Synchronously fetch highlights for multiple files + * + * @param paths File paths to look up (same form as @p path of fetchHighlight()) + * @param keyword Search keyword, as for fetchHighlight() + * @param type SearchType::Content or SearchType::Ocr + * @param options Highlight configuration (preview length, HTML toggle) + * @return Mapping of path -> highlighted content (empty string if not found) + */ + QMap<QString, QString> fetchHighlights(const QStringList &paths, + const QString &keyword, + SearchType type, + const HighlightOptions &options = {}) const; + + /** + * @brief Synchronously fetch full stored content for a single file + * + * Opens the Lucene index, locates the document by path, + * and returns the full stored content field. 
+ * + * @param path Absolute file path + * @param type SearchType::Content or SearchType::Ocr + * @return Full content text, or empty string if not found + */ + QString fetchContent(const QString &path, SearchType type) const; + + /** + * @brief Synchronously fetch full stored contents for multiple files + * @return Mapping of path -> full content (empty string if not found) + */ + QMap fetchContents(const QStringList &paths, + SearchType type) const; + +private: + struct Private; + std::unique_ptr d; +}; + +DFM_SEARCH_END_NS + +#endif // CONTENTRETRIEVER_H diff --git a/include/dfm-search/dfm-search/dimensionextractor.h b/include/dfm-search/dfm-search/dimensionextractor.h new file mode 100644 index 00000000..c798a14f --- /dev/null +++ b/include/dfm-search/dfm-search/dimensionextractor.h @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef DIMENSIONEXTRACTOR_H +#define DIMENSIONEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class DimensionExtractor +{ +public: + virtual ~DimensionExtractor() = default; + + /** + * @brief Extract a dimension from the input text and populate the intent. + * @param input The raw natural language input + * @param intent The intent to populate with extracted data + */ + virtual void extract(const QString &input, ParsedIntent &intent) = 0; + + /** + * @brief Get the name of this extractor for debugging. 
+ */ + virtual QString name() const = 0; +}; + +DFM_SEARCH_END_NS + +#endif // DIMENSIONEXTRACTOR_H diff --git a/include/dfm-search/dfm-search/dsearch_global.h b/include/dfm-search/dfm-search/dsearch_global.h index f18892cd..f03a064b 100644 --- a/include/dfm-search/dfm-search/dsearch_global.h +++ b/include/dfm-search/dfm-search/dsearch_global.h @@ -203,6 +203,7 @@ enum SearchType { FileName, // Search by file name Content, // Search by content within files Ocr, // Search by OCR-extracted text from images + Semantic = 40, // Semantic / natural-language search (launches sub-engines internally) Custom = 50 // User-defined search type }; Q_ENUM_NS(SearchType) @@ -227,8 +228,10 @@ Q_ENUM_NS(SearchMethod) // Enumeration for time field type enum class TimeField { - BirthTime, // File creation time - ModifyTime // File modification time + Unspecified, // No specific time field (search both BirthTime and ModifyTime) + BirthTime, // File creation time + ModifyTime, // File modification time + Both // Search both BirthTime and ModifyTime (union of results) }; Q_ENUM_NS(TimeField) diff --git a/include/dfm-search/dfm-search/field_names.h b/include/dfm-search/dfm-search/field_names.h index 2cdbf9f8..d4fb833f 100644 --- a/include/dfm-search/dfm-search/field_names.h +++ b/include/dfm-search/dfm-search/field_names.h @@ -21,6 +21,7 @@ constexpr const wchar_t kFullPath[] = L"full_path"; constexpr const wchar_t kIsHidden[] = L"is_hidden"; constexpr const wchar_t kModifyTime[] = L"modify_time"; constexpr const wchar_t kBirthTime[] = L"birth_time"; +constexpr const wchar_t kFileSize[] = L"file_size"; constexpr const wchar_t kFileSizeStr[] = L"file_size_str"; constexpr const wchar_t kPinyin[] = L"pinyin"; constexpr const wchar_t kPinyinAcronym[] = L"pinyin_acronym"; @@ -37,6 +38,7 @@ constexpr const wchar_t kAncestorPaths[] = L"ancestor_paths"; constexpr const wchar_t kBirthTime[] = L"birth_time"; constexpr const wchar_t kModifyTime[] = L"modify_time"; constexpr const wchar_t kFileSize[] 
= L"file_size"; +constexpr const wchar_t kCheckSum[] = L"checksum"; } // namespace Content // OCR text index field names diff --git a/include/dfm-search/dfm-search/filenamesearchapi.h b/include/dfm-search/dfm-search/filenamesearchapi.h index 4af8fba7..4093f34d 100644 --- a/include/dfm-search/dfm-search/filenamesearchapi.h +++ b/include/dfm-search/dfm-search/filenamesearchapi.h @@ -237,6 +237,20 @@ class FileNameResultAPI */ QString birthTimeString() const; + // ==================== File Size (Numeric) ==================== + + /** + * @brief Set the file size in bytes + * @param bytes File size in bytes + */ + void setFileSizeBytes(qint64 bytes); + + /** + * @brief Get the file size in bytes + * @return File size in bytes, 0 if not set + */ + qint64 fileSizeBytes() const; + private: SearchResult &m_result; }; diff --git a/include/dfm-search/dfm-search/ocrtextsearchapi.h b/include/dfm-search/dfm-search/ocrtextsearchapi.h index 51ef0207..ff8c9298 100644 --- a/include/dfm-search/dfm-search/ocrtextsearchapi.h +++ b/include/dfm-search/dfm-search/ocrtextsearchapi.h @@ -75,6 +75,18 @@ class OcrTextResultAPI : public TextSearchResultAPI * @param content The OCR extracted text to set */ void setOcrContent(const QString &content); + + /** + * @brief Get the file checksum + * @return The checksum string, or empty if not set + */ + QString checksum() const; + + /** + * @brief Set the file checksum + * @param checksum The checksum string to set + */ + void setChecksum(const QString &checksum); }; DFM_SEARCH_END_NS diff --git a/include/dfm-search/dfm-search/searchoptions.h b/include/dfm-search/dfm-search/searchoptions.h index 6d22b5c9..f9ae1dce 100644 --- a/include/dfm-search/dfm-search/searchoptions.h +++ b/include/dfm-search/dfm-search/searchoptions.h @@ -10,6 +10,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -74,14 +75,41 @@ class SearchOptions /** * @brief Get the starting search path + * + * Returns the first path from searchPaths(), or an empty string if no paths 
are set. + * @return The primary search path */ QString searchPath() const; /** * @brief Set the starting search path + * + * This replaces all search paths with a single path. + * @param path The search path */ void setSearchPath(const QString &path); + /** + * @brief Get all search paths + * + * When multiple paths are set, the search engine will search all of them + * and combine results. Returns a list containing the single searchPath + * if only one path was set via setSearchPath(). + * + * @return List of search paths + */ + QStringList searchPaths() const; + + /** + * @brief Set multiple search paths + * + * When multiple paths are set, search engines that support multi-path + * queries will build combined path prefix queries internally. + * + * @param paths List of search paths + */ + void setSearchPaths(const QStringList &paths); + /** * @brief Returns the current list of excluded search paths. * @@ -254,6 +282,40 @@ class SearchOptions */ void clearTimeRangeFilter(); + /** + * @brief Sets the file size range filter for search operations. + * + * The size range filter allows filtering search results based on file size in bytes. + * Supports setting minimum and/or maximum file size boundaries. + * + * @param filter The SizeRangeFilter to apply + * @sa sizeRangeFilter(), hasSizeRangeFilter(), clearSizeRangeFilter() + */ + void setSizeRangeFilter(const SizeRangeFilter &filter); + + /** + * @brief Returns the current file size range filter. + * + * @return The current SizeRangeFilter + * @sa setSizeRangeFilter() + */ + SizeRangeFilter sizeRangeFilter() const; + + /** + * @brief Checks if a file size range filter is set. + * + * @return true if a valid size range filter is set, false otherwise + * @sa setSizeRangeFilter(), clearSizeRangeFilter() + */ + bool hasSizeRangeFilter() const; + + /** + * @brief Clears the file size range filter. 
+ * + * @sa setSizeRangeFilter(), hasSizeRangeFilter() + */ + void clearSizeRangeFilter(); + private: std::unique_ptr d; // PIMPL }; diff --git a/include/dfm-search/dfm-search/semantic_types.h b/include/dfm-search/dfm-search/semantic_types.h new file mode 100644 index 00000000..520aea03 --- /dev/null +++ b/include/dfm-search/dfm-search/semantic_types.h @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTIC_TYPES_H +#define SEMANTIC_TYPES_H + +#include +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +/** + * @brief Represents a consumed span in the input text matched by a rule. + */ +struct MatchSpan +{ + int start = -1; + int end = -1; + QString ruleId; + + bool isValid() const { return start >= 0 && end > start; } +}; + +/** + * @brief Enum for preset time periods. + */ +enum class TimePreset { + Today, + Yesterday, + DayBeforeYesterday, + ThisWeek, + LastWeek, + ThisMonth, + LastMonth, + ThisYear, + LastYear +}; + +/** + * @brief Enum for time constraint kinds. + */ +enum class TimeConstraintKind { + None, ///< No time constraint + Preset, ///< Preset period (today, yesterday, etc.) + Relative, ///< Relative time (last N days/hours) + Custom ///< Custom datetime range +}; + +/** + * @brief Represents a parsed time constraint from natural language. + */ +struct TimeConstraint +{ + TimeConstraintKind kind = TimeConstraintKind::None; + TimePreset preset = TimePreset::Today; + int relativeValue = 0; + TimeUnit relativeUnit = TimeUnit::Days; + QDateTime customStart; + QDateTime customEnd; + TimeField timeField = TimeField::Unspecified; // Set by ActionExtractor; Unspecified = no action specified + + bool isValid() const { return kind != TimeConstraintKind::None; } +}; + +/** + * @brief Represents a parsed size constraint from natural language. 
+ */ +struct SizeConstraint +{ + qint64 minSize = 0; // Minimum size in bytes (0 = no lower bound) + qint64 maxSize = 0; // Maximum size in bytes (0 = no upper bound) + bool includeLower = true; + bool includeUpper = true; + + bool isValid() const { return minSize > 0 || maxSize > 0; } +}; + +/** + * @brief Represents the target scope for a semantic search. + * + * When a user explicitly specifies where to search (e.g. "文件名包含XX" + * vs "文件内容包含XX"), this enum controls which search paths are enabled. + */ +enum class SearchTarget { + All, ///< All search paths enabled (default) + FileNameOnly, ///< Only filename search + ContentOnly ///< Only content + OCR search +}; + +/** + * @brief Represents the parsed intent from natural language input. + * + * This is the intermediate representation between NLP parsing + * and search query construction. Declared public for future + * structured API extensibility. + */ +struct ParsedIntent +{ + TimeConstraint timeConstraint; + SizeConstraint sizeConstraint; + QStringList fileExtensions; + QStringList searchDirectories; // Absolute paths resolved from location words + bool includeHidden = false; // true for trash (hidden directory) + QStringList keywords; + SearchTarget searchTarget = SearchTarget::All; + QList consumedSpans; +}; + +DFM_SEARCH_END_NS + +#endif // SEMANTIC_TYPES_H diff --git a/include/dfm-search/dfm-search/semanticsearcher.h b/include/dfm-search/dfm-search/semanticsearcher.h new file mode 100644 index 00000000..10e81a8e --- /dev/null +++ b/include/dfm-search/dfm-search/semanticsearcher.h @@ -0,0 +1,209 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTICSEARCHER_H +#define SEMANTICSEARCHER_H + +#include + +#include +#include +#include +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticSearcherData; + +/** + * @brief The SemanticSearcher class provides natural language based file search. 
+ * + * This class parses natural language queries (e.g., "today's pdf documents") + * into structured search conditions, then orchestrates parallel searches + * across filename, content, and OCR indexes. + * + * Usage: + * @code + * SemanticSearcher *searcher = new SemanticSearcher(this); + * connect(searcher, &SemanticSearcher::resultsFound, [](const SearchResultList &results) { + * for (const auto &r : results) { + * qDebug() << r.path(); + * } + * }); + * searcher->search("today's pdf documents"); + * @endcode + */ +class SemanticSearcher : public QObject +{ + Q_OBJECT + +public: + /** + * @brief Construct a semantic searcher + * @param parent Parent QObject + */ + explicit SemanticSearcher(QObject *parent = nullptr); + + /** + * @brief Destructor + */ + ~SemanticSearcher() override; + + /** + * @brief Get the current search status + */ + SearchStatus status() const; + + /** + * @brief Set the search timeout in seconds + * @param seconds Timeout duration (default 60, 0 to disable) + */ + void setSearchTimeout(int seconds); + + /** + * @brief Get the search timeout in seconds + */ + int searchTimeout() const; + + /** + * @brief Perform a semantic search with natural language input + * @param naturalLanguage The natural language query string + */ + void search(const QString &naturalLanguage); + + /** + * @brief Perform a semantic search with explicit search directories + * + * When @p searchDirectories is non-empty, those directories take priority + * over any directories resolved from the natural language input. + * If empty, falls back to NLP-parsed directories, then home directory. + * + * @param naturalLanguage The natural language query string + * @param searchDirectories Explicit directories to search in + */ + void search(const QString &naturalLanguage, const QStringList &searchDirectories); + + /** + * @brief Check if the input contains semantic intent beyond a plain keyword. 
+ * + * Returns true if parsing the input reveals time constraints, size constraints, + * file type filters, or location constraints. Returns false for plain keyword input. + * + * This allows callers to avoid unnecessary semantic search overhead when + * the user is just typing a simple keyword. + * + * @param input The natural language query to check + * @return true if the input contains semantic intent, false for plain keywords + */ + bool isSemanticQuery(const QString &input) const; + + /** + * @brief Perform a synchronous semantic search + * + * Blocks the calling thread until all search engines complete or timeout. + * Uses QEventLoop internally, so it works from the GUI thread. + * @param naturalLanguage The natural language query string + * @return SearchResultExpected containing deduplicated results or an error + */ + SearchResultExpected searchSync(const QString &naturalLanguage); + + /** + * @brief Perform a synchronous semantic search with explicit directories + * @param naturalLanguage The natural language query string + * @param searchDirectories Explicit directories to search in + * @return SearchResultExpected containing deduplicated results or an error + */ + SearchResultExpected searchSync(const QString &naturalLanguage, const QStringList &searchDirectories); + + /** + * @brief Cancel the current search operation + */ + void cancel(); + + /** + * @brief Enable or disable detailed results for sub-engines + * + * When enabled, each sub-engine (FileName, Content, OCR) will populate + * extra metadata fields (file type, size, timestamps, etc.) in results. + * Must be called before search(). 
+ * + * @param enable true to enable detailed results (default false) + */ + void setDetailedResultsEnabled(bool enable); + + /** + * @brief Check whether detailed results are enabled + */ + bool isDetailedResultsEnabled() const; + + /** + * @brief Set the maximum number of results to return + * + * Each sub-engine (FileName, Content, OCR) will be limited to this count. + * After all engines finish, results are deduplicated and truncated + * to this count. + * + * @param count Maximum result count (0 = unlimited, default 0) + */ + void setMaxResults(int count); + + /** + * @brief Get the maximum number of results + * @return Maximum result count (0 = unlimited) + */ + int maxResults() const; + +Q_SIGNALS: + /** + * @brief Emitted after the natural language input is parsed into an intent + * + * This fires before searchStarted(), allowing callers to inspect + * what the NLP parser understood from the input. + * + * @param intent The parsed intent structure + */ + void intentParsed(const DFMSEARCH::ParsedIntent &intent); + + /** + * @brief Emitted when a search operation starts + */ + void searchStarted(); + + /** + * @brief Emitted when search results are found + * @param results The found search results + */ + void resultsFound(const DFMSEARCH::SearchResultList &results); + + /** + * @brief Emitted when the search status changes + * @param status The new search status + */ + void statusChanged(SearchStatus status); + + /** + * @brief Emitted when a search operation completes + * @param results The list of all search results (deduplicated) + */ + void searchFinished(const DFMSEARCH::SearchResultList &results); + + /** + * @brief Emitted when a search operation is cancelled + */ + void searchCancelled(); + + /** + * @brief Emitted when an error occurs during search + * @param error The SearchError that occurred + */ + void errorOccurred(const DFMSEARCH::SearchError &error); + +private: + Q_DISABLE_COPY(SemanticSearcher) + std::unique_ptr d_ptr; +}; + 
+DFM_SEARCH_END_NS + +#endif // SEMANTICSEARCHER_H diff --git a/include/dfm-search/dfm-search/sizerangefilter.h b/include/dfm-search/dfm-search/sizerangefilter.h new file mode 100644 index 00000000..57d15f02 --- /dev/null +++ b/include/dfm-search/dfm-search/sizerangefilter.h @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef SIZERANGEFILTER_H +#define SIZERANGEFILTER_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SizeRangeFilterData; + +/** + * @brief The SizeRangeFilter class provides file size range filtering for search operations. + * + * This class provides a fluent interface for specifying file size ranges. + * Size values are in bytes. + * + * Example usage: + * @code + * // Files between 1KB and 10MB + * SizeRangeFilter filter; + * filter.setRange(1024, 10 * 1024 * 1024); + * + * // Files larger than 1MB (including 1MB) + * SizeRangeFilter filter; + * filter.setMin(1024 * 1024).setIncludeLower(true); + * + * // Files smaller than 100KB (excluding 100KB) + * SizeRangeFilter filter; + * filter.setMax(100 * 1024).setIncludeUpper(false); + * @endcode + */ +class SizeRangeFilter +{ +public: + SizeRangeFilter(); + SizeRangeFilter(const SizeRangeFilter &other); + SizeRangeFilter(SizeRangeFilter &&other) noexcept; + ~SizeRangeFilter(); + + SizeRangeFilter &operator=(const SizeRangeFilter &other); + SizeRangeFilter &operator=(SizeRangeFilter &&other) noexcept; + + // ---------- Range Setting ---------- + + /** + * @brief Set the minimum file size in bytes + * @param minSize Minimum file size (0 means no lower bound) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setMin(qint64 minSize); + + /** + * @brief Set the maximum file size in bytes + * @param maxSize Maximum file size (0 means no upper bound) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setMax(qint64 maxSize); + + /** + * @brief Set both min and 
max file size in bytes + * @param minSize Minimum file size (0 means no lower bound) + * @param maxSize Maximum file size (0 means no upper bound) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setRange(qint64 minSize, qint64 maxSize); + + // ---------- Accessors ---------- + + /** + * @brief Get the minimum file size + * @return Minimum file size in bytes (0 means no lower bound) + */ + qint64 minSize() const; + + /** + * @brief Get the maximum file size + * @return Maximum file size in bytes (0 means no upper bound) + */ + qint64 maxSize() const; + + // ---------- Boundary Control ---------- + + /** + * @brief Set whether the lower bound is inclusive + * @param include true to include the lower bound (default: true) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setIncludeLower(bool include); + + /** + * @brief Set whether the upper bound is inclusive + * @param include true to include the upper bound (default: true) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &setIncludeUpper(bool include); + + /** + * @brief Check if lower bound is inclusive + * @return true if lower bound is inclusive + */ + bool includeLower() const; + + /** + * @brief Check if upper bound is inclusive + * @return true if upper bound is inclusive + */ + bool includeUpper() const; + + // ---------- Filter State ---------- + + /** + * @brief Clear the filter (make it invalid) + * @return Reference to this filter for method chaining + */ + SizeRangeFilter &clear(); + + /** + * @brief Check if the filter is valid (has at least one bound set) + * @return true if min or max is set (> 0) + */ + bool isValid() const; + +private: + std::unique_ptr d; +}; + +DFM_SEARCH_END_NS + +#endif // SIZERANGEFILTER_H diff --git a/include/dfm-search/dfm-search/textsearchapi.h b/include/dfm-search/dfm-search/textsearchapi.h index 3f52e8be..c9ce5319 100644 --- a/include/dfm-search/dfm-search/textsearchapi.h +++ 
b/include/dfm-search/dfm-search/textsearchapi.h @@ -75,6 +75,25 @@ class TextSearchOptionsAPI */ bool isFullTextRetrievalEnabled() const; + // ==================== Filename Search ==================== + + /** + * @brief Sets a keyword to search on the filename field. + * + * When set, the search will also match against the indexed filename field + * in addition to (or instead of) the content field. If both a content keyword + * and a filename keyword are provided, results must match both (AND logic). + * + * @param keyword The filename keyword to search for. + */ + void setFilenameKeyword(const QString &keyword); + + /** + * @brief Gets the filename keyword for search. + * @return The filename keyword, or empty string if not set. + */ + QString filenameKeyword() const; + protected: SearchOptions &m_options; }; @@ -108,6 +127,20 @@ class TextSearchResultAPI */ void setHighlightedContent(const QString &content); + // ==================== File Size ==================== + + /** + * @brief Set the file size in bytes + * @param bytes File size in bytes + */ + void setFileSizeBytes(qint64 bytes); + + /** + * @brief Get the file size in bytes + * @return File size in bytes, 0 if not set + */ + qint64 fileSizeBytes() const; + // ==================== Extended Attributes ==================== /** diff --git a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp b/src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp deleted file mode 100644 index dfe14a1a..00000000 --- a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.cpp +++ /dev/null @@ -1,49 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. 
-///////////////////////////////////////////////////////////////////////////// - -#include "chineseanalyzer.h" -#include "chinesetokenizer.h" - -#include -#include - -#define UNUSED(x) (void)x; - -namespace Lucene { - -ChineseAnalyzer::~ChineseAnalyzer() -{ -} - -TokenStreamPtr ChineseAnalyzer::tokenStream(const String &fieldName, const ReaderPtr &reader) -{ - UNUSED(fieldName) - - TokenStreamPtr result = newLucene(reader); - result = newLucene(result); - return result; -} - -TokenStreamPtr ChineseAnalyzer::reusableTokenStream(const String &fieldName, const ReaderPtr &reader) -{ - UNUSED(fieldName) - - ChineseAnalyzerSavedStreamsPtr streams(boost::dynamic_pointer_cast(getPreviousTokenStream())); - if (!streams) { - streams = newLucene(); - streams->source = newLucene(reader); - setPreviousTokenStream(streams); - } else { - streams->source->reset(reader); - } - return streams->source; -} - -ChineseAnalyzerSavedStreams::~ChineseAnalyzerSavedStreams() -{ -} - -} diff --git a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.h b/src/dfm-search/3rdparty/fulltext/chineseanalyzer.h deleted file mode 100644 index 32bd3f2c..00000000 --- a/src/dfm-search/3rdparty/fulltext/chineseanalyzer.h +++ /dev/null @@ -1,53 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. 
-///////////////////////////////////////////////////////////////////////////// - -#ifndef CHINESEANALYZER_H -#define CHINESEANALYZER_H - -#include -#include - -namespace Lucene { - -/** - * An Analyzer that tokenizes text with ChineseTokenizer - * Only used for Lucene++ - */ -class LPPCONTRIBAPI ChineseAnalyzer : public Analyzer -{ -public: - virtual ~ChineseAnalyzer(); - - LUCENE_CLASS(ChineseAnalyzer); - -public: - /// Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. - /// - /// @return A {@link TokenStream} built from {@link ChineseTokenizer}, filtered with {@link ChineseFilter} - virtual TokenStreamPtr tokenStream(const String &fieldName, const ReaderPtr &reader); - - /// Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the - /// provided {@link Reader}. - /// - /// @return A {@link TokenStream} built from {@link ChineseTokenizer}, filtered with {@link ChineseFilter} - virtual TokenStreamPtr reusableTokenStream(const String &fieldName, const ReaderPtr &reader); -}; - -class LPPCONTRIBAPI ChineseAnalyzerSavedStreams : public LuceneObject -{ -public: - virtual ~ChineseAnalyzerSavedStreams(); - - LUCENE_CLASS(ChineseAnalyzerSavedStreams); - -public: - TokenizerPtr source; - TokenStreamPtr result; -}; - -} - -#endif // CHINESEANALYZER_H diff --git a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp b/src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp deleted file mode 100644 index addb9203..00000000 --- a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. 
-///////////////////////////////////////////////////////////////////////////// - -#include -#include -#include -#include -#include -#include -#include - -#include "chinesetokenizer.h" - -namespace Lucene { - -const int32_t ChineseTokenizer::kMaxWordLen = 255; -const int32_t ChineseTokenizer::kIoBufferSize = 1024; - -ChineseTokenizer::ChineseTokenizer(const ReaderPtr &input) - : Tokenizer(input) -{ -} - -ChineseTokenizer::ChineseTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input) - : Tokenizer(source, input) -{ -} - -ChineseTokenizer::ChineseTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input) - : Tokenizer(factory, input) -{ -} - -ChineseTokenizer::~ChineseTokenizer() -{ -} - -void ChineseTokenizer::initialize() -{ - offset = 0; - bufferIndex = 0; - dataLen = 0; - buffer = CharArray::newInstance(kMaxWordLen); - memset(buffer.get(), 0, kMaxWordLen); - ioBuffer = CharArray::newInstance(kIoBufferSize); - memset(ioBuffer.get(), 0, kIoBufferSize); - length = 0; - start = 0; - - termAtt = addAttribute(); - offsetAtt = addAttribute(); -} - -void ChineseTokenizer::push(wchar_t c) -{ - if (length == 0) { - start = offset - 1; // start of token - } - buffer[length++] = CharFolder::toLower(c); // buffer it -} - -bool ChineseTokenizer::flush() -{ - if (length > 0) { - termAtt->setTermBuffer(buffer.get(), 0, length); - offsetAtt->setOffset(correctOffset(start), correctOffset(start + length)); - return true; - } else { - return false; - } -} - -bool ChineseTokenizer::incrementToken() -{ - clearAttributes(); - - length = 0; - start = offset; - - while (true) { - wchar_t c; - ++offset; - - if (bufferIndex >= dataLen) { - dataLen = input->read(ioBuffer.get(), 0, ioBuffer.size()); - bufferIndex = 0; - } - - if (dataLen == -1) { - --offset; - return flush(); - } else { - c = ioBuffer[bufferIndex++]; - } - - if (length > 0) { - --bufferIndex; - --offset; - return flush(); - } - push(c); - return flush(); - } -} - -void ChineseTokenizer::end() -{ - // 
set final offset - int32_t finalOffset = correctOffset(offset); - offsetAtt->setOffset(finalOffset, finalOffset); -} - -void ChineseTokenizer::reset() -{ - Tokenizer::reset(); - offset = 0; - bufferIndex = 0; - dataLen = 0; -} - -void ChineseTokenizer::reset(const ReaderPtr &input) -{ - Tokenizer::reset(input); - reset(); -} - -} diff --git a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.h b/src/dfm-search/3rdparty/fulltext/chinesetokenizer.h deleted file mode 100644 index c93759d6..00000000 --- a/src/dfm-search/3rdparty/fulltext/chinesetokenizer.h +++ /dev/null @@ -1,69 +0,0 @@ -///////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009-2014 Alan Wright. All rights reserved. -// Distributable under the terms of either the Apache License (Version 2.0) -// or the GNU Lesser General Public License. -///////////////////////////////////////////////////////////////////////////// - -#ifndef CHINESETOKENIZER_H -#define CHINESETOKENIZER_H - -#include - -/** - * An tokenizer that tokenizes chinese - * Only used for Lucene++ - */ -namespace Lucene { -class ChineseTokenizer : public Tokenizer -{ -public: - explicit ChineseTokenizer(const ReaderPtr &input); - ChineseTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input); - ChineseTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input); - - virtual ~ChineseTokenizer(); - - LUCENE_CLASS(ChineseTokenizer); - -protected: - /// Max word length - static const int32_t kMaxWordLen; - - static const int32_t kIoBufferSize; - -protected: - /// word offset, used to imply which character(in) is parsed - int32_t offset; - - /// the index used only for ioBuffer - int32_t bufferIndex; - - /// data length - int32_t dataLen; - - /// character buffer, store the characters which are used to compose the returned Token - CharArray buffer; - - /// I/O buffer, used to store the content of the input (one of the members of Tokenizer) - CharArray ioBuffer; - - TermAttributePtr 
termAtt; - OffsetAttributePtr offsetAtt; - - int32_t length; - int32_t start; - -public: - virtual void initialize(); - virtual bool incrementToken(); - virtual void end(); - virtual void reset(); - virtual void reset(const ReaderPtr &input); - -protected: - void push(wchar_t c); - bool flush(); -}; -} - -#endif // CHINESETOKENIZER_H diff --git a/src/dfm-search/dfm-search-client/CMakeLists.txt b/src/dfm-search/dfm-search-client/CMakeLists.txt index dcba205b..e630c69e 100644 --- a/src/dfm-search/dfm-search-client/CMakeLists.txt +++ b/src/dfm-search/dfm-search-client/CMakeLists.txt @@ -8,6 +8,8 @@ set(SRCS cli_options.h time_parser.cpp time_parser.h + size_parser.cpp + size_parser.h output/output_formatter.h output/text_output.cpp output/text_output.h diff --git a/src/dfm-search/dfm-search-client/cli_options.cpp b/src/dfm-search/dfm-search-client/cli_options.cpp index 63986e61..2c3bcdc0 100644 --- a/src/dfm-search/dfm-search-client/cli_options.cpp +++ b/src/dfm-search/dfm-search-client/cli_options.cpp @@ -4,6 +4,7 @@ #include "cli_options.h" #include "time_parser.h" +#include "size_parser.h" #include #include @@ -25,6 +26,7 @@ CliOptions::CliOptions() m_fileExtensionsOption(QStringList() << "file-extensions", "Filter by file extensions, comma separated", "extensions"), m_maxResultsOption(QStringList() << "max-results", "Maximum number of results (0 for unlimited)", "number", "0"), m_maxPreviewOption(QStringList() << "max-preview", "Max content preview length", "length", "200"), + m_filenameOption(QStringList() << "filename", "Search by filename in content/ocr index", "keyword"), m_wildcardOption(QStringList() << "wildcard", "Enable wildcard search with * and ? 
patterns"), m_jsonOption(QStringList() << "json" << "j", @@ -32,6 +34,9 @@ CliOptions::CliOptions() m_verboseOption(QStringList() << "verbose" << "v", "Enable verbose output with detailed result information"), + m_semanticOption(QStringList() << "semantic" + << "s", + "Enable semantic natural language search"), m_timeFieldOption(QStringList() << "time-field", "Time field to filter (birth or modify)", "field", "modify"), m_timeLastOption(QStringList() << "time-last", "Rolling time window (e.g., 3d, 2h, 30m)", "duration"), m_timeTodayOption(QStringList() << "time-today", "Filter files from today"), @@ -42,7 +47,9 @@ CliOptions::CliOptions() m_timeLastMonthOption(QStringList() << "time-last-month", "Filter files from last month"), m_timeThisYearOption(QStringList() << "time-this-year", "Filter files from this year"), m_timeLastYearOption(QStringList() << "time-last-year", "Filter files from last year"), - m_timeRangeOption(QStringList() << "time-range", "Custom time range (start,end)", "range") + m_timeRangeOption(QStringList() << "time-range", "Custom time range (start,end)", "range"), + m_sizeMinOption(QStringList() << "size-min", "Minimum file size (e.g., 1K, 10M, 1G, 512)", "size"), + m_sizeMaxOption(QStringList() << "size-max", "Maximum file size (e.g., 1K, 10M, 1G, 512)", "size") { setupOptions(); } @@ -52,7 +59,7 @@ void CliOptions::setupOptions() m_parser.setApplicationDescription("DFM Search Client"); m_parser.addHelpOption(); - // 基本选项 + // Basic options m_parser.addOption(m_typeOption); m_parser.addOption(m_methodOption); m_parser.addOption(m_queryOption); @@ -64,11 +71,13 @@ void CliOptions::setupOptions() m_parser.addOption(m_fileExtensionsOption); m_parser.addOption(m_maxResultsOption); m_parser.addOption(m_maxPreviewOption); + m_parser.addOption(m_filenameOption); m_parser.addOption(m_wildcardOption); m_parser.addOption(m_jsonOption); m_parser.addOption(m_verboseOption); + m_parser.addOption(m_semanticOption); - // 时间范围过滤选项 + // Time range filtering 
options m_parser.addOption(m_timeFieldOption); m_parser.addOption(m_timeLastOption); m_parser.addOption(m_timeTodayOption); @@ -81,14 +90,22 @@ void CliOptions::setupOptions() m_parser.addOption(m_timeLastYearOption); m_parser.addOption(m_timeRangeOption); - // 位置参数 + // File size range filtering options + m_parser.addOption(m_sizeMinOption); + m_parser.addOption(m_sizeMaxOption); + + // Positional arguments m_parser.addPositionalArgument("keyword", "Search keyword"); m_parser.addPositionalArgument("search_path", "Path to search in"); } void CliOptions::printHelp() const { - std::cout << "Usage: dfm-searcher [options] " << std::endl; + std::cout << "Usage: dfm-searcher [options] [search_path]" << std::endl; + std::cout << std::endl; + std::cout << "Semantic Search:" << std::endl; + std::cout << " --semantic, -s Enable semantic natural language search" << std::endl; + std::cout << " Example: dfm-searcher -s \"recent 3 days images\" /home/user" << std::endl; std::cout << std::endl; std::cout << "Search Types:" << std::endl; std::cout << " --type= Search type (default: filename)" << std::endl; @@ -108,6 +125,7 @@ void CliOptions::printHelp() const std::cout << " --file-extensions= Filter by file extensions, comma separated" << std::endl; std::cout << " --max-results= Maximum number of results (0 for unlimited)" << std::endl; std::cout << " --max-preview= Max content preview length (for content/ocr search)" << std::endl; + std::cout << " --filename= Search by filename in content/ocr index" << std::endl; std::cout << std::endl; std::cout << "Time Range Filter Options:" << std::endl; std::cout << " --time-field= Time field to filter (birth=creation, modify=modification)" << std::endl; @@ -125,6 +143,13 @@ void CliOptions::printHelp() const std::cout << " --time-range=, Custom time range (format: YYYY-MM-DD or \"YYYY-MM-DD HH:MM\")" << std::endl; std::cout << " Example: --time-range=\"2025-01-01,2025-12-31\"" << std::endl; std::cout << std::endl; + std::cout << "File Size 
Range Filter Options:" << std::endl; + std::cout << " --size-min= Minimum file size (e.g., 1K, 10M, 1G, 512)" << std::endl; + std::cout << " Units: K=KB, M=MB, G=GB, T=TB (default: bytes)" << std::endl; + std::cout << " --size-max= Maximum file size (e.g., 1K, 10M, 1G, 512)" << std::endl; + std::cout << " Units: K=KB, M=MB, G=GB, T=TB (default: bytes)" << std::endl; + std::cout << " Example: --size-min=1M --size-max=100M" << std::endl; + std::cout << std::endl; std::cout << "Output Options:" << std::endl; std::cout << " --json, -j Output results in JSON format" << std::endl; std::cout << " --verbose, -v Enable verbose output with detailed result information" << std::endl; @@ -142,29 +167,134 @@ void CliOptions::printHelp() const std::cout << std::endl; std::cout << " # Realtime search with time filter" << std::endl; std::cout << " dfm-searcher --method=realtime --time-last=7d \"report\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << " # Semantic search: find recent images" << std::endl; + std::cout << " dfm-searcher --semantic \"recent 3 days images\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << " # Semantic search with JSON output" << std::endl; + std::cout << " dfm-searcher -s -j \"content contains meeting notes\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << " # Filename search with file size filter (1MB to 100MB)" << std::endl; + std::cout << " dfm-searcher --size-min=1M --size-max=100M \"video\" /home/user" << std::endl; + std::cout << std::endl; + std::cout << "Highlight Retrieval (on-demand):" << std::endl; + std::cout << " dfm-searcher highlight --type= [path2 ...] [-j]" << std::endl; + std::cout << " Fetch highlighted content snippets for specific files without running a full search." 
<< std::endl; + std::cout << std::endl; + std::cout << " # Fetch highlight for a single file" << std::endl; + std::cout << " dfm-searcher highlight --type=content \"hello\" /home/user/doc.txt" << std::endl; + std::cout << std::endl; + std::cout << " # Batch fetch highlights with JSON output" << std::endl; + std::cout << " dfm-searcher highlight --type=ocr \"screenshot\" img1.png img2.png -j" << std::endl; } bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) { + // Pre-scan for "highlight" subcommand + const QStringList rawArgs = app.arguments(); + if (rawArgs.size() >= 2 && rawArgs.at(1) == "highlight") { + config.subcommand = "highlight"; + } + m_parser.process(app); QStringList positionalArgs = m_parser.positionalArguments(); - if (positionalArgs.size() < 2) { + + // For highlight subcommand, the positional args are: keyword= + paths= + // "highlight" itself is consumed as first positional by QCommandLineParser + if (config.subcommand == "highlight") { + // Skip "highlight" keyword from positional args (it was parsed as the first positional) + QStringList args = positionalArgs; + if (!args.isEmpty() && args.first() == "highlight") { + args.removeFirst(); + } + if (args.isEmpty()) { + std::cerr << "Error: highlight requires and at least one " << std::endl; + return false; + } + + config.keyword = args.first(); + // Remaining args are file paths + config.searchPath = args.mid(1).join(','); // Reuse searchPath to store comma-separated paths + + // Validate search type for highlight + QString typeStr = m_parser.value(m_typeOption); + if (typeStr == "content") { + config.searchType = SearchType::Content; + } else if (typeStr == "ocr") { + config.searchType = SearchType::Ocr; + } else { + std::cerr << "Error: highlight requires --type=content or --type=ocr" << std::endl; + return false; + } + + config.jsonOutput = m_parser.isSet(m_jsonOption); + if (m_parser.isSet(m_maxPreviewOption)) { + bool ok; + int previewLength = 
m_parser.value(m_maxPreviewOption).toInt(&ok); + if (ok && previewLength > 0) { + config.maxPreviewLength = previewLength; + } + } + return true; + } + + if (positionalArgs.isEmpty()) { printHelp(); return false; } + // Semantic mode: only keyword is required, search path is optional + config.semanticMode = m_parser.isSet(m_semanticOption); config.keyword = positionalArgs.at(0); - config.searchPath = positionalArgs.at(1); + if (positionalArgs.size() >= 2) { + config.searchPath = positionalArgs.at(1); + } - // 验证搜索路径 - QFileInfo pathInfo(config.searchPath); - if (!pathInfo.exists() || !pathInfo.isDir()) { - std::cerr << "Error: Search path does not exist or is not a directory" << std::endl; + // Validate search path (not required in semantic mode) + if (!config.searchPath.isEmpty()) { + QFileInfo pathInfo(config.searchPath); + if (!pathInfo.exists() || !pathInfo.isDir()) { + std::cerr << "Error: Search path does not exist or is not a directory" << std::endl; + return false; + } + } else if (!config.semanticMode) { + std::cerr << "Error: Search path is required" << std::endl; + printHelp(); return false; } - // 解析搜索类型 + // Auto-enable includeHidden when search path contains hidden directory components. + // User who explicitly specifies a hidden path (e.g. ~/.local/share/Trash) + // expects results without needing --include-hidden. 
+ config.includeHidden = m_parser.isSet(m_includeHiddenOption); + if (!config.includeHidden && !config.searchPath.isEmpty() + && Global::isHiddenPathOrInHiddenDir(config.searchPath)) { + config.includeHidden = true; + } + + // In semantic mode, skip type/method/query parsing + if (config.semanticMode) { + config.jsonOutput = m_parser.isSet(m_jsonOption); + config.verbose = m_parser.isSet(m_verboseOption); + if (m_parser.isSet(m_maxPreviewOption)) { + bool ok; + int previewLength = m_parser.value(m_maxPreviewOption).toInt(&ok); + if (ok && previewLength > 0) { + config.maxPreviewLength = previewLength; + } + } + if (m_parser.isSet(m_maxResultsOption)) { + bool ok; + int maxResults = m_parser.value(m_maxResultsOption).toInt(&ok); + if (ok && maxResults >= 0) { + config.maxResults = maxResults; + } + } + return true; + } + + // Parse search type (non-semantic mode only) QString typeStr = m_parser.value(m_typeOption); if (typeStr == "content") { config.searchType = SearchType::Content; @@ -197,7 +327,6 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) // 解析开关选项 config.caseSensitive = m_parser.isSet(m_caseSensitiveOption); - config.includeHidden = m_parser.isSet(m_includeHiddenOption); config.pinyinEnabled = m_parser.isSet(m_pinyinOption); config.pinyinAcronymEnabled = m_parser.isSet(m_pinyinAcronymOption); config.jsonOutput = m_parser.isSet(m_jsonOption); @@ -220,6 +349,11 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) #endif } + // 解析文件名搜索选项(仅对 content/ocr 搜索有效) + if (m_parser.isSet(m_filenameOption)) { + config.filenameKeyword = m_parser.value(m_filenameOption); + } + // 解析数值选项 if (m_parser.isSet(m_maxResultsOption)) { bool ok; @@ -238,7 +372,34 @@ bool CliOptions::parse(QCoreApplication &app, SearchCliConfig &config) } // 解析时间范围选项 - return parseTimeOptions(config); + if (!parseTimeOptions(config)) { + return false; + } + + // 解析文件大小范围选项 + if (m_parser.isSet(m_sizeMinOption)) { + qint64 minBytes = 0; + if 
(SizeParser::parseSize(m_parser.value(m_sizeMinOption), minBytes) && minBytes > 0) { + config.sizeFilter.setMin(minBytes); + config.hasSizeFilter = true; + } else { + std::cerr << "Error: Invalid --size-min format. Use format like '1K', '10M', '1G', or '512'" << std::endl; + return false; + } + } + + if (m_parser.isSet(m_sizeMaxOption)) { + qint64 maxBytes = 0; + if (SizeParser::parseSize(m_parser.value(m_sizeMaxOption), maxBytes) && maxBytes > 0) { + config.sizeFilter.setMax(maxBytes); + config.hasSizeFilter = true; + } else { + std::cerr << "Error: Invalid --size-max format. Use format like '1K', '10M', '1G', or '512'" << std::endl; + return false; + } + } + + return true; } bool CliOptions::parseTimeOptions(SearchCliConfig &config) diff --git a/src/dfm-search/dfm-search-client/cli_options.h b/src/dfm-search/dfm-search-client/cli_options.h index 10bdbf15..8e1179c6 100644 --- a/src/dfm-search/dfm-search-client/cli_options.h +++ b/src/dfm-search/dfm-search-client/cli_options.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace dfmsearch { @@ -22,7 +23,10 @@ namespace dfmsearch { */ struct SearchCliConfig { - // 基本参数 + // Subcommand: if non-empty, the first positional arg is treated as a subcommand + QString subcommand; // "" (search), "highlight" + + // Basic parameters QString keyword; QString searchPath; SearchType searchType = SearchType::FileName; @@ -44,9 +48,19 @@ struct SearchCliConfig int maxResults = 0; // 0 表示不限制 int maxPreviewLength = 200; - // 时间范围过滤 + // 文件名搜索选项 + QString filenameKeyword; + + // Semantic mode + bool semanticMode = false; + + // Time range filtering bool hasTimeFilter = false; DFMSEARCH::TimeRangeFilter timeFilter; + + // File size range filtering + bool hasSizeFilter = false; + DFMSEARCH::SizeRangeFilter sizeFilter; }; /** @@ -93,11 +107,13 @@ class CliOptions QCommandLineOption m_fileExtensionsOption; QCommandLineOption m_maxResultsOption; QCommandLineOption m_maxPreviewOption; + QCommandLineOption m_filenameOption; 
QCommandLineOption m_wildcardOption; QCommandLineOption m_jsonOption; QCommandLineOption m_verboseOption; + QCommandLineOption m_semanticOption; - // 时间范围过滤选项 + // Time range filtering options QCommandLineOption m_timeFieldOption; QCommandLineOption m_timeLastOption; QCommandLineOption m_timeTodayOption; @@ -109,6 +125,10 @@ class CliOptions QCommandLineOption m_timeThisYearOption; QCommandLineOption m_timeLastYearOption; QCommandLineOption m_timeRangeOption; + + // File size range filtering options + QCommandLineOption m_sizeMinOption; + QCommandLineOption m_sizeMaxOption; }; } // namespace dfmsearch diff --git a/src/dfm-search/dfm-search-client/main.cpp b/src/dfm-search/dfm-search-client/main.cpp index 26ed2457..9500f719 100644 --- a/src/dfm-search/dfm-search-client/main.cpp +++ b/src/dfm-search/dfm-search-client/main.cpp @@ -4,6 +4,10 @@ #include #include +#include +#include +#include +#include #include #include @@ -12,11 +16,15 @@ #include #include #include +#include +#include #include "cli_options.h" #include "output/text_output.h" #include "output/json_output.h" +#include + using namespace dfmsearch; /** @@ -50,21 +58,32 @@ static void configureSearchOptions(SearchOptions &options, const SearchCliConfig } else if (config.searchType == SearchType::Content) { ContentOptionsAPI contentOptions(options); contentOptions.setMaxPreviewLength(config.maxPreviewLength); - contentOptions.setFullTextRetrievalEnabled(true); - contentOptions.setSearchResultHighlightEnabled(true); + contentOptions.setFullTextRetrievalEnabled(config.verbose); + contentOptions.setSearchResultHighlightEnabled(config.verbose); contentOptions.setFilenameContentMixedAndSearchEnabled(true); + if (!config.filenameKeyword.isEmpty()) { + contentOptions.setFilenameKeyword(config.filenameKeyword); + } } else if (config.searchType == SearchType::Ocr) { OcrTextOptionsAPI ocrTextOptions(options); ocrTextOptions.setMaxPreviewLength(config.maxPreviewLength); - 
ocrTextOptions.setFullTextRetrievalEnabled(true); - ocrTextOptions.setSearchResultHighlightEnabled(true); + ocrTextOptions.setFullTextRetrievalEnabled(config.verbose); + ocrTextOptions.setSearchResultHighlightEnabled(config.verbose); ocrTextOptions.setFilenameOcrContentMixedAndSearchEnabled(true); + if (!config.filenameKeyword.isEmpty()) { + ocrTextOptions.setFilenameKeyword(config.filenameKeyword); + } } // 应用时间范围过滤 if (config.hasTimeFilter) { options.setTimeRangeFilter(config.timeFilter); } + + // 应用文件大小范围过滤 + if (config.hasSizeFilter) { + options.setSizeRangeFilter(config.sizeFilter); + } } /** @@ -143,14 +162,121 @@ int main(int argc, char *argv[]) { QCoreApplication app(argc, argv); - // 解析命令行参数 + // Parse CLI arguments CliOptions cliOptions; SearchCliConfig config; if (!cliOptions.parse(app, config)) { return 1; } - // 创建搜索引擎 + // Highlight subcommand: fetch highlighted content on demand + if (config.subcommand == "highlight") { + DFMSEARCH::ContentRetriever retriever; + DFMSEARCH::HighlightOptions hlOptions; + hlOptions.maxPreviewLength = config.maxPreviewLength; + + // Paths are stored as comma-separated in config.searchPath +#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) + QStringList paths = config.searchPath.split(',', Qt::SkipEmptyParts); +#else + QStringList paths = config.searchPath.split(',', QString::SkipEmptyParts); +#endif + + if (config.jsonOutput) { + // JSON output + QJsonObject root; + root["type"] = "highlight"; + root["searchType"] = (config.searchType == SearchType::Content) ? 
"content" : "ocr"; + root["keyword"] = config.keyword; + + QJsonArray results; + for (const QString &path : paths) { + QJsonObject item; + item["path"] = path; + item["contentMatch"] = retriever.fetchHighlight(path, config.keyword, config.searchType, hlOptions); + results.append(item); + } + + root["totalResults"] = results.size(); + root["results"] = results; + + QJsonDocument doc(root); + std::cout << doc.toJson(QJsonDocument::Indented).toStdString() << std::endl; + } else { + // Text output + QTextStream out(stdout); + for (const QString &path : paths) { + QString hl = retriever.fetchHighlight(path, config.keyword, config.searchType, hlOptions); + out << path << "\n"; + if (!hl.isEmpty()) { + out << " " << hl << "\n"; + } else { + out << " (no match)\n"; + } + out << Qt::endl; + } + } + return 0; + } + + // Semantic search mode + if (config.semanticMode) { + auto *semanticSearcher = new DFMSEARCH::SemanticSearcher(&app); + semanticSearcher->setDetailedResultsEnabled(config.verbose); + if (config.maxResults > 0) { + semanticSearcher->setMaxResults(config.maxResults); + } + + OutputFormatter *formatter = createOutputFormatter(config, &app); + + // 为语义搜索构建 formatter 需要的 options + SearchOptions formatterOptions; + formatterOptions.setDetailedResultsEnabled(config.verbose); + if (config.hasTimeFilter) formatterOptions.setTimeRangeFilter(config.timeFilter); + if (config.hasSizeFilter) formatterOptions.setSizeRangeFilter(config.sizeFilter); + + JsonOutput *jsonOutput = qobject_cast(formatter); + if (jsonOutput) { + jsonOutput->setSearchOptions(formatterOptions); + } + TextOutput *textOutput = qobject_cast(formatter); + if (textOutput) { + textOutput->setSearchOptions(formatterOptions); + textOutput->setVerbose(config.verbose); + } + + formatter->setSearchContext(config.keyword, config.searchPath, + SearchType::Semantic, SearchMethod::Indexed); + + QObject::connect(formatter, &OutputFormatter::finished, &app, &QCoreApplication::quit); + 
QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::intentParsed, + [formatter](const DFMSEARCH::ParsedIntent &intent) { + if (auto *jsonOut = qobject_cast(formatter)) { + jsonOut->setParsedIntent(intent); + } + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchStarted, [formatter]() { + formatter->outputSearchStarted(); + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchFinished, [formatter](const SearchResultList &results) { + formatter->outputSearchFinished(results); + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::searchCancelled, [formatter]() { + formatter->outputSearchCancelled(); + }); + QObject::connect(semanticSearcher, &DFMSEARCH::SemanticSearcher::errorOccurred, [formatter](const DFMSEARCH::SearchError &error) { + formatter->outputError(error); + }); + + QStringList semanticDirs; + if (!config.searchPath.isEmpty()) { + semanticDirs = QStringList { config.searchPath }; + } + semanticSearcher->search(config.keyword, semanticDirs); + return app.exec(); + } + + // Create search engine (non-semantic mode) SearchEngine *engine = SearchFactory::createEngine(config.searchType, &app); if (!engine) { qCritical() << "Error: Failed to create search engine"; diff --git a/src/dfm-search/dfm-search-client/output/json_output.cpp b/src/dfm-search/dfm-search-client/output/json_output.cpp index b6208a2d..e2979940 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.cpp +++ b/src/dfm-search/dfm-search-client/output/json_output.cpp @@ -22,6 +22,125 @@ void JsonOutput::setSearchContext(const QString &keyword, const QString &searchP m_searchMethod = searchMethod; } +QJsonObject JsonOutput::intentToJson(const DFMSEARCH::ParsedIntent &intent) +{ + QJsonObject obj; + + // timeConstraint + if (intent.timeConstraint.isValid()) { + QJsonObject time; + QString kindStr; + switch (intent.timeConstraint.kind) { + case DFMSEARCH::TimeConstraintKind::Preset: + kindStr = "preset"; + { + 
QString presetStr; + switch (intent.timeConstraint.preset) { + case DFMSEARCH::TimePreset::Today: presetStr = "today"; break; + case DFMSEARCH::TimePreset::Yesterday: presetStr = "yesterday"; break; + case DFMSEARCH::TimePreset::DayBeforeYesterday: presetStr = "dayBeforeYesterday"; break; + case DFMSEARCH::TimePreset::ThisWeek: presetStr = "thisWeek"; break; + case DFMSEARCH::TimePreset::LastWeek: presetStr = "lastWeek"; break; + case DFMSEARCH::TimePreset::ThisMonth: presetStr = "thisMonth"; break; + case DFMSEARCH::TimePreset::LastMonth: presetStr = "lastMonth"; break; + case DFMSEARCH::TimePreset::ThisYear: presetStr = "thisYear"; break; + case DFMSEARCH::TimePreset::LastYear: presetStr = "lastYear"; break; + } + time["preset"] = presetStr; + } + break; + case DFMSEARCH::TimeConstraintKind::Relative: + kindStr = "relative"; + time["value"] = intent.timeConstraint.relativeValue; + { + QString unitStr; + switch (intent.timeConstraint.relativeUnit) { + case DFMSEARCH::TimeUnit::Minutes: unitStr = "minutes"; break; + case DFMSEARCH::TimeUnit::Hours: unitStr = "hours"; break; + case DFMSEARCH::TimeUnit::Days: unitStr = "days"; break; + case DFMSEARCH::TimeUnit::Weeks: unitStr = "weeks"; break; + case DFMSEARCH::TimeUnit::Months: unitStr = "months"; break; + case DFMSEARCH::TimeUnit::Years: unitStr = "years"; break; + } + time["unit"] = unitStr; + } + break; + case DFMSEARCH::TimeConstraintKind::Custom: + kindStr = "custom"; + if (intent.timeConstraint.customStart.isValid()) + time["start"] = intent.timeConstraint.customStart.toString(Qt::ISODate); + if (intent.timeConstraint.customEnd.isValid()) + time["end"] = intent.timeConstraint.customEnd.toString(Qt::ISODate); + break; + case DFMSEARCH::TimeConstraintKind::None: + break; + } + time["kind"] = kindStr; + + if (intent.timeConstraint.timeField != DFMSEARCH::TimeField::Unspecified) { + QString fieldStr; + switch (intent.timeConstraint.timeField) { + case DFMSEARCH::TimeField::ModifyTime: fieldStr = "modifyTime"; 
break; + case DFMSEARCH::TimeField::BirthTime: fieldStr = "birthTime"; break; + case DFMSEARCH::TimeField::Both: fieldStr = "both"; break; + default: fieldStr = "unspecified"; break; + } + time["timeField"] = fieldStr; + } + + obj["timeConstraint"] = time; + } + + // sizeConstraint + if (intent.sizeConstraint.isValid()) { + QJsonObject size; + if (intent.sizeConstraint.minSize > 0) + size["minBytes"] = intent.sizeConstraint.minSize; + if (intent.sizeConstraint.maxSize > 0) + size["maxBytes"] = intent.sizeConstraint.maxSize; + size["includeLower"] = intent.sizeConstraint.includeLower; + size["includeUpper"] = intent.sizeConstraint.includeUpper; + obj["sizeConstraint"] = size; + } + + // fileExtensions + if (!intent.fileExtensions.isEmpty()) { + obj["fileExtensions"] = QJsonArray::fromStringList(intent.fileExtensions); + } + + // searchDirectories (NLP-parsed, before caller override is applied) + if (!intent.searchDirectories.isEmpty()) { + obj["searchDirectories"] = QJsonArray::fromStringList(intent.searchDirectories); + } + + // keywords + if (!intent.keywords.isEmpty()) { + obj["keywords"] = QJsonArray::fromStringList(intent.keywords); + } + + // includeHidden + obj["includeHidden"] = intent.includeHidden; + + // consumedSpans + if (!intent.consumedSpans.isEmpty()) { + QJsonArray spans; + for (const auto &span : intent.consumedSpans) { + if (span.isValid()) { + QJsonObject s; + s["start"] = span.start; + s["end"] = span.end; + s["ruleId"] = span.ruleId; + spans.append(s); + } + } + if (!spans.isEmpty()) { + obj["consumedSpans"] = spans; + } + } + + return obj; +} + QJsonValue JsonOutput::resultToJson(const SearchResult &result) { // 如果不启用详细结果,只返回路径 @@ -40,6 +159,12 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) if (!resultAPI.isDirectory()) { obj["fileType"] = resultAPI.fileType(); obj["size"] = resultAPI.size(); + + // 文件大小数值(字节) + qint64 fileSizeBytes = resultAPI.fileSizeBytes(); + if (fileSizeBytes > 0) { + obj["sizeBytes"] = fileSizeBytes; 
+ } } QString filename = resultAPI.filename(); @@ -106,6 +231,12 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) obj["birthTime"] = birthTimeObj; } + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + obj["sizeBytes"] = sizeBytes; + } + return obj; } else if (m_searchType == SearchType::Ocr) { // OCR 搜索:返回详细对象 @@ -146,6 +277,29 @@ QJsonValue JsonOutput::resultToJson(const SearchResult &result) obj["birthTime"] = birthTimeObj; } + // 文件校验和 + QString checksum = resultAPI.checksum(); + if (!checksum.isEmpty()) { + obj["checksum"] = checksum; + } + + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + obj["sizeBytes"] = sizeBytes; + } + + return obj; + } else if (m_searchType == SearchType::Semantic) { + // 语义搜索:结果来自多个子引擎,通用输出所有 customAttributes + QJsonObject obj; + obj["path"] = result.path(); + + const QVariantMap attrs = result.customAttributes(); + for (auto it = attrs.cbegin(); it != attrs.cend(); ++it) { + obj[it.key()] = QJsonValue::fromVariant(it.value()); + } + return obj; } return result.path(); @@ -184,6 +338,9 @@ void JsonOutput::outputStreamingStart() case SearchType::Ocr: searchTypeStr = "ocr"; break; + case SearchType::Semantic: + searchTypeStr = "semantic"; + break; default: searchTypeStr = "unknown"; } @@ -207,6 +364,26 @@ void JsonOutput::outputStreamingStart() searchInfo["timeRangeFilter"] = timeFilterInfo; } + // 添加文件大小范围过滤信息 + if (m_options.hasSizeRangeFilter()) { + DFMSEARCH::SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QJsonObject sizeFilterInfo; + if (sizeFilter.minSize() > 0) { + sizeFilterInfo["minBytes"] = sizeFilter.minSize(); + } + if (sizeFilter.maxSize() > 0) { + sizeFilterInfo["maxBytes"] = sizeFilter.maxSize(); + } + sizeFilterInfo["includeLower"] = sizeFilter.includeLower(); + sizeFilterInfo["includeUpper"] = sizeFilter.includeUpper(); + searchInfo["sizeRangeFilter"] = sizeFilterInfo; + } + + // 语义搜索:附加 ParsedIntent + if (m_searchType == 
SearchType::Semantic && m_parsedIntent.has_value()) { + searchInfo["intent"] = intentToJson(*m_parsedIntent); + } + startObj["search"] = searchInfo; startObj["timestamp"] = QDateTime::currentDateTime().toString(Qt::ISODate); @@ -281,6 +458,9 @@ void JsonOutput::outputCompleteResult(const QList &results) case SearchType::Ocr: searchTypeStr = "ocr"; break; + case SearchType::Semantic: + searchTypeStr = "semantic"; + break; default: searchTypeStr = "unknown"; } @@ -300,6 +480,26 @@ void JsonOutput::outputCompleteResult(const QList &results) searchInfo["timeRangeFilter"] = timeFilterInfo; } + // 添加文件大小范围过滤信息 + if (m_options.hasSizeRangeFilter()) { + DFMSEARCH::SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QJsonObject sizeFilterInfo; + if (sizeFilter.minSize() > 0) { + sizeFilterInfo["minBytes"] = sizeFilter.minSize(); + } + if (sizeFilter.maxSize() > 0) { + sizeFilterInfo["maxBytes"] = sizeFilter.maxSize(); + } + sizeFilterInfo["includeLower"] = sizeFilter.includeLower(); + sizeFilterInfo["includeUpper"] = sizeFilter.includeUpper(); + searchInfo["sizeRangeFilter"] = sizeFilterInfo; + } + + // 语义搜索:附加 ParsedIntent + if (m_searchType == SearchType::Semantic && m_parsedIntent.has_value()) { + searchInfo["intent"] = intentToJson(*m_parsedIntent); + } + root["search"] = searchInfo; // 时间戳 diff --git a/src/dfm-search/dfm-search-client/output/json_output.h b/src/dfm-search/dfm-search-client/output/json_output.h index 69d4db2f..a8c14d96 100644 --- a/src/dfm-search/dfm-search-client/output/json_output.h +++ b/src/dfm-search/dfm-search-client/output/json_output.h @@ -7,9 +7,11 @@ #include "output_formatter.h" #include +#include #include #include +#include namespace dfmsearch { @@ -40,6 +42,13 @@ class JsonOutput : public OutputFormatter */ void setSearchOptions(const SearchOptions &options) { m_options = options; } + /** + * @brief 设置语义搜索解析的 ParsedIntent + * + * 在 searchType 为 Semantic 时,intent 信息会序列化到 JSON 输出中 + */ + void setParsedIntent(const 
DFMSEARCH::ParsedIntent &intent) { m_parsedIntent = intent; } + /** * @brief 设置是否启用详细输出模式 * @param verbose true 启用详细输出 @@ -50,6 +59,9 @@ class JsonOutput : public OutputFormatter QJsonValue resultToJson(const SearchResult &result); void printJsonLine(const QJsonObject &obj); + // Intent 序列化辅助 + QJsonObject intentToJson(const DFMSEARCH::ParsedIntent &intent); + // 流式输出方法 void outputStreamingStart(); void outputStreamingResult(const SearchResult &result); @@ -69,6 +81,7 @@ class JsonOutput : public OutputFormatter bool m_streaming; bool m_verbose = false; QJsonArray m_collectedResults; + std::optional m_parsedIntent; }; } // namespace dfmsearch diff --git a/src/dfm-search/dfm-search-client/output/text_output.cpp b/src/dfm-search/dfm-search-client/output/text_output.cpp index c1cca72e..0edc37c9 100644 --- a/src/dfm-search/dfm-search-client/output/text_output.cpp +++ b/src/dfm-search/dfm-search-client/output/text_output.cpp @@ -38,6 +38,8 @@ void TextOutput::outputSearchStarted() typeStr = "Content"; else if (m_searchType == SearchType::Ocr) typeStr = "Ocr"; + else if (m_searchType == SearchType::Semantic) + typeStr = "Semantic"; std::cout << "Search type: " << typeStr.toStdString() << std::endl; std::cout << "Search method: " << (m_searchMethod == SearchMethod::Indexed ? 
"Indexed" : "Realtime") << std::endl; @@ -55,6 +57,20 @@ void TextOutput::outputSearchStarted() << " to " << end.toString("yyyy-MM-dd HH:mm:ss").toStdString(); std::cout << std::endl; } + + // 打印文件大小范围过滤 + if (m_options.hasSizeRangeFilter()) { + DFMSEARCH::SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + std::cout << "Size range filter: "; + if (sizeFilter.minSize() > 0) { + std::cout << "min=" << sizeFilter.minSize() << " bytes"; + } + if (sizeFilter.maxSize() > 0) { + if (sizeFilter.minSize() > 0) std::cout << ", "; + std::cout << "max=" << sizeFilter.maxSize() << " bytes"; + } + std::cout << std::endl; + } } void TextOutput::printSearchResult(const SearchResult &result) @@ -75,7 +91,12 @@ void TextOutput::printSearchResult(const SearchResult &result) std::cout << " Type: Directory" << std::endl; } else { std::cout << " Type: " << resultAPI.fileType().toStdString() << std::endl; - std::cout << " Size: " << resultAPI.size().toStdString() << " bytes" << std::endl; + qint64 fileSizeBytes = resultAPI.fileSizeBytes(); + if (fileSizeBytes > 0) { + std::cout << " Size: " << fileSizeBytes << " bytes" << std::endl; + } else { + std::cout << " Size: " << resultAPI.size().toStdString() << " bytes" << std::endl; + } } // 文件名和扩展名 @@ -131,6 +152,12 @@ void TextOutput::printSearchResult(const SearchResult &result) std::cout << " Created: " << resultAPI.birthTimeString().toStdString() << " (timestamp: " << birthTs << ")" << std::endl; } + + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + std::cout << " Size: " << sizeBytes << " bytes" << std::endl; + } } else if (m_searchType == SearchType::Ocr) { OcrTextResultAPI resultAPI(const_cast(result)); @@ -163,6 +190,25 @@ void TextOutput::printSearchResult(const SearchResult &result) std::cout << " Created: " << resultAPI.birthTimeString().toStdString() << " (timestamp: " << birthTs << ")" << std::endl; } + + // 文件校验和 + QString checksum = resultAPI.checksum(); + if (!checksum.isEmpty()) { + 
std::cout << " Checksum: " << checksum.toStdString() << std::endl; + } + + // 文件大小 + qint64 sizeBytes = resultAPI.fileSizeBytes(); + if (sizeBytes > 0) { + std::cout << " Size: " << sizeBytes << " bytes" << std::endl; + } + } else if (m_searchType == SearchType::Semantic) { + // 语义搜索:通用输出所有 customAttributes + const QVariantMap attrs = result.customAttributes(); + for (auto it = attrs.cbegin(); it != attrs.cend(); ++it) { + std::cout << " " << it.key().toStdString() << ": " + << it.value().toString().toStdString() << std::endl; + } } std::cout << std::endl; diff --git a/src/dfm-search/dfm-search-client/size_parser.cpp b/src/dfm-search/dfm-search-client/size_parser.cpp new file mode 100644 index 00000000..d28720d3 --- /dev/null +++ b/src/dfm-search/dfm-search-client/size_parser.cpp @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "size_parser.h" + +namespace dfmsearch { + +bool SizeParser::parseSize(const QString &arg, qint64 &bytes) +{ + if (arg.isEmpty()) { + return false; + } + + QString trimmed = arg.trimmed(); + if (trimmed.isEmpty()) { + return false; + } + + // 提取数字部分和单位后缀 + QString numStr; + QString suffix; + + for (int i = 0; i < trimmed.length(); ++i) { + QChar c = trimmed[i]; + if (c.isDigit() || c == '.') { + numStr += c; + } else { + suffix = trimmed.mid(i).trimmed().toUpper(); + break; + } + } + + if (numStr.isEmpty()) { + return false; + } + + bool ok = false; + double value = numStr.toDouble(&ok); + if (!ok || value < 0) { + return false; + } + + // 根据后缀计算字节数 + qint64 multiplier = 1; + if (suffix == "K" || suffix == "KB") { + multiplier = 1024LL; + } else if (suffix == "M" || suffix == "MB") { + multiplier = 1024LL * 1024; + } else if (suffix == "G" || suffix == "GB") { + multiplier = 1024LL * 1024 * 1024; + } else if (suffix == "T" || suffix == "TB") { + multiplier = 1024LL * 1024 * 1024 * 1024; + } else if (!suffix.isEmpty()) { + // 未知后缀 + return 
false; + } + + bytes = static_cast(value * multiplier); + return true; +} + +} // namespace dfmsearch diff --git a/src/dfm-search/dfm-search-client/size_parser.h b/src/dfm-search/dfm-search-client/size_parser.h new file mode 100644 index 00000000..5ece29ba --- /dev/null +++ b/src/dfm-search/dfm-search-client/size_parser.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SIZE_PARSER_H +#define SIZE_PARSER_H + +#include + +namespace dfmsearch { + +/** + * @brief 文件大小参数解析工具类 + * + * 支持解析人类可读的文件大小字符串,如 "1K", "10M", "1G", "512" + * 不带后缀的纯数字视为字节数。 + */ +class SizeParser +{ +public: + /** + * @brief 解析文件大小字符串 + * @param arg 输入字符串(如 "1K", "10M", "1G", "512") + * @param bytes 输出字节数 + * @return 解析成功返回true + * + * 支持的后缀(不区分大小写): + * - K/KB: 千字节 (1024) + * - M/MB: 兆字节 (1024^2) + * - G/GB: 吉字节 (1024^3) + * - T/TB: 太字节 (1024^4) + * - 无后缀: 纯字节数 + */ + static bool parseSize(const QString &arg, qint64 &bytes); +}; + +} // namespace dfmsearch + +#endif // SIZE_PARSER_H diff --git a/src/dfm-search/dfm-search-lib/CMakeLists.txt b/src/dfm-search/dfm-search-lib/CMakeLists.txt index b8c12c62..fe26131b 100644 --- a/src/dfm-search/dfm-search-lib/CMakeLists.txt +++ b/src/dfm-search/dfm-search-lib/CMakeLists.txt @@ -11,8 +11,6 @@ file(GLOB_RECURSE PUBLIC_INCLUDES CONFIGURE_DEPENDS FILE (GLOB_RECURSE SRCS CONFIGURE_DEPENDS "./*.cpp" "./*.h" - "../3rdparty/*.cpp" - "../3rdparty/*.h" ) # Qt6 diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp index 882a3155..bca9df31 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentsearchengine.cpp @@ -42,7 +42,8 @@ SearchError ContentSearchEngine::validateSearchConditions() } if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().toUtf8().size() 
< Global::kMinContentSearchKeywordLength) { + && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength + && api.filenameKeyword().isEmpty()) { return SearchError(ContentSearchErrorCode::KeywordTooShort); } diff --git a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp index bc498aa6..eab4797e 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.cpp @@ -11,19 +11,17 @@ #include #include -#include #include #include #include +#include #include #include -#include "3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" #include "utils/lucenequeryutils.h" -#include "utils/searchutility.h" #include "utils/lucene_cancellation_compat.h" #include "utils/timerangeutils.h" @@ -64,21 +62,16 @@ void ContentIndexedStrategy::search(const SearchQuery &query) } } -Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath) +Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &query) { try { m_keywords.clear(); ContentOptionsAPI optAPI(m_options); // Use the member m_options bool mixedAndEnabled = optAPI.isFilenameContentMixedAndSearchEnabled(); - Lucene::QueryParserPtr contentsParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::Content::kContents, - analyzer); - Lucene::QueryPtr mainQuery; if (query.type() == SearchQuery::Type::Simple) { - mainQuery = buildSimpleContentsQuery(query, contentsParser); + mainQuery = buildSimpleContentsQuery(query); } else if (query.type() == SearchQuery::Type::Boolean) { if (query.subQueries().isEmpty()) { // For an empty boolean query, match nothing. 
@@ -87,12 +80,12 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Determine which logic path to take for boolean queries if (mixedAndEnabled && query.booleanOperator() == SearchQuery::BooleanOperator::AND) { // New "advanced" AND logic for contents/filename - mainQuery = buildAdvancedAndQuery(query, contentsParser, analyzer); + mainQuery = buildAdvancedAndQuery(query); } else { // "Standard" contents-only logic for: // 1. OR queries (regardless of mixedAndEnabled value). // 2. AND queries when mixedAndEnabled is false. - mainQuery = buildStandardBooleanContentsQuery(query, contentsParser); + mainQuery = buildStandardBooleanContentsQuery(query); } } } else { @@ -100,35 +93,85 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que mainQuery = newLucene(); // Should not happen } + // Add filename keyword query (before filters, so it replaces empty content query correctly) + QString filenameKw = optAPI.filenameKeyword(); + if (!filenameKw.isEmpty()) { + Lucene::QueryPtr filenameQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kFilename), + filenameKw); + + if (filenameQuery) { + // Check if content keywords are effectively empty + bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) + ? 
query.keyword().isEmpty() + : (query.subQueries().isEmpty() + || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), + [](const auto &sq) { return sq.keyword().isEmpty(); })); + + if (noContentKeywords) { + // Filename-only search: replace empty content query with filename query + mainQuery = filenameQuery; + } else if (mainQuery) { + // Both content and filename: AND combination + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(filenameQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } + m_keywords.append(filenameKw); + } + } + // Add path prefix query optimization - if (mainQuery && SearchUtility::isContentIndexAncestorPathsSupported() - && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildPathPrefixQuery(searchPath, - QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); + QStringList searchPathsList = m_options.searchPaths(); + if (mainQuery) { + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); if (pathPrefixQuery) { BooleanQueryPtr finalQuery = newLucene(); finalQuery->add(mainQuery, BooleanClause::MUST); finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using path prefix query for content search optimization:" << searchPath; + qInfo() << "Using multi-path prefix query for content search optimization:" << searchPathsList; mainQuery = finalQuery; } } + // Add excluded paths filter (pushed down to query layer to avoid per-doc filtering) + if (mainQuery) { + const QStringList &excludedPaths = m_options.searchExcludedPaths(); + if (!excludedPaths.isEmpty()) { + QueryPtr excludedQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + excludedPaths, + QString::fromWCharArray(LuceneFieldNames::Content::kAncestorPaths)); + if (excludedQuery) { + BooleanQueryPtr finalQuery = 
newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(excludedQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + } + } + + // Add hidden file filter (pushed down to query layer) + if (mainQuery && !m_options.includeHidden()) { + QueryPtr hiddenQuery = Lucene::newLucene( + Lucene::newLucene( + LuceneFieldNames::Content::kIsHidden, + L"Y")); + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(hiddenQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); - auto [start, end] = filter.resolveTimeRange(); - - qint64 startEpoch = TimeRangeUtils::toEpochSecs(start); - qint64 endEpoch = TimeRangeUtils::toEpochSecs(end); - - const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) - ? LuceneFieldNames::Content::kBirthTime - : LuceneFieldNames::Content::kModifyTime; - - QueryPtr timeQuery = TimeRangeUtils::buildNumericRangeQuery( - fieldName, startEpoch, endEpoch, - filter.includeLower(), filter.includeUpper()); + QueryPtr timeQuery = TimeRangeUtils::buildTimeRangeFilterQuery( + filter, + LuceneFieldNames::Content::kBirthTime, + LuceneFieldNames::Content::kModifyTime); if (timeQuery) { if (mainQuery) { @@ -143,6 +186,27 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add file size range filter query + if (m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QueryPtr sizeQuery = TimeRangeUtils::buildNumericRangeQuery( + LuceneFieldNames::Content::kFileSize, + sizeFilter.minSize(), sizeFilter.maxSize(), + sizeFilter.includeLower(), sizeFilter.includeUpper()); + + if (sizeQuery) { + if (mainQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(sizeQuery, BooleanClause::MUST); + 
mainQuery = finalQuery; + } else { + // Size filter alone is a valid query + mainQuery = sizeQuery; + } + } + } + return mainQuery; } catch (const Lucene::LuceneException &e) { @@ -154,15 +218,9 @@ Lucene::QueryPtr ContentIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } -QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, const Lucene::QueryParserPtr &contentsParser, const Lucene::AnalyzerPtr &analyzer) +QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query) { // This method implements the new "mixed" AND logic. - // It requires its own filenameParser. - Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::Content::kFilename, - analyzer); - Lucene::BooleanQueryPtr overallQuery = newLucene(); Lucene::BooleanQueryPtr mainAndClausesQuery = newLucene(); Lucene::BooleanQueryPtr allContentsQuery = newLucene(); @@ -176,10 +234,12 @@ QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, } hasValidKeywords = true; - // 使用 LuceneQueryUtils 处理特殊字符 - Lucene::String processedKeyword = LuceneQueryUtils::processQueryString(subQuery.keyword(), false); - Lucene::QueryPtr contentsTermQuery = contentsParser->parse(processedKeyword); - Lucene::QueryPtr filenameTermQuery = filenameParser->parse(processedKeyword); + Lucene::QueryPtr contentsTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kContents), + subQuery.keyword()); + Lucene::QueryPtr filenameTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kFilename), + subQuery.keyword()); // Build (contents:keyword OR filename:keyword) Lucene::BooleanQueryPtr combinedTermQuery = newLucene(); @@ -209,7 +269,7 @@ QueryPtr ContentIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, return overallQuery; } -QueryPtr 
ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &contentsParser) +QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQuery &query) { // This method implements the "original" boolean logic, searching only "contents". Lucene::BooleanQueryPtr booleanQuery = newLucene(); @@ -220,8 +280,9 @@ QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQ continue; // Skip empty keywords } - // 使用 LuceneQueryUtils 处理特殊字符 - Lucene::QueryPtr termQuery = contentsParser->parse(LuceneQueryUtils::processQueryString(subQuery.keyword(), false)); + Lucene::QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kContents), + subQuery.keyword()); booleanQuery->add(termQuery, query.booleanOperator() == SearchQuery::BooleanOperator::AND ? Lucene::BooleanClause::MUST : Lucene::BooleanClause::SHOULD); } @@ -229,14 +290,15 @@ QueryPtr ContentIndexedStrategy::buildStandardBooleanContentsQuery(const SearchQ return booleanQuery; } -QueryPtr ContentIndexedStrategy::buildSimpleContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &contentsParser) +QueryPtr ContentIndexedStrategy::buildSimpleContentsQuery(const SearchQuery &query) { m_keywords.append(query.keyword()); if (query.keyword().isEmpty()) { return newLucene(); // Match nothing for empty keyword } - // 使用 LuceneQueryUtils 处理特殊字符 - return contentsParser->parse(LuceneQueryUtils::processQueryString(query.keyword(), false)); + return LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::Content::kContents), + query.keyword()); } void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr &searcher, @@ -246,14 +308,33 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QElapsedTimer resultTimer; resultTimer.start(); - QString searchPath = m_options.searchPath(); - const 
QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); auto docsSize = scoreDocs.size(); ContentOptionsAPI optAPI(m_options); bool enableHTML = optAPI.isSearchResultHighlightEnabled(); int previewLen = optAPI.maxPreviewLength() > 0 ? optAPI.maxPreviewLength() : 50; bool enableRetrieval = optAPI.isFullTextRetrievalEnabled(); + bool detailedResults = m_options.detailedResultsEnabled(); + + // Build field selector to avoid loading the large 'contents' field when not needed. + // The contents field stores full document text and loading it for every result + // (even when only path is needed) causes significant disk I/O overhead. + Lucene::Collection fieldsToLoad = Lucene::Collection::newInstance(); + if (enableRetrieval) { + fieldsToLoad.add(LuceneFieldNames::Content::kContents); + } + fieldsToLoad.add(LuceneFieldNames::Content::kPath); + if (Q_UNLIKELY(detailedResults)) { + fieldsToLoad.add(LuceneFieldNames::Content::kFilename); + fieldsToLoad.add(LuceneFieldNames::Content::kIsHidden); + fieldsToLoad.add(LuceneFieldNames::Content::kModifyTime); + fieldsToLoad.add(LuceneFieldNames::Content::kBirthTime); + fieldsToLoad.add(LuceneFieldNames::Content::kFileSize); + } + Lucene::FieldSelectorPtr fieldSelector = newLucene(fieldsToLoad); + + // Pre-allocate to avoid reallocation during append + m_results.reserve(m_results.size() + static_cast(docsSize)); for (int32_t i = 0; i < docsSize; ++i) { if (m_cancelled.load()) { @@ -263,21 +344,14 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr try { Lucene::ScoreDocPtr scoreDoc = scoreDocs[i]; - if (!scoreDoc) { - qWarning() << "Null ScoreDoc encountered at index" << i; + if (!scoreDoc || scoreDoc->doc < 0) { + qWarning() << "Invalid ScoreDoc at index" << i; continue; } - // Defensive check: verify document ID is valid - if (scoreDoc->doc < 0) { - qWarning() << "Invalid document ID:" << scoreDoc->doc; - continue; - } - - // Safely retrieve document (could throw if index is corrupted) 
Lucene::DocumentPtr doc; try { - doc = searcher->doc(scoreDoc->doc); + doc = searcher->doc(scoreDoc->doc, fieldSelector); if (!doc) { qWarning() << "Failed to retrieve document at index:" << scoreDoc->doc; continue; @@ -290,53 +364,18 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr continue; } - // Safely get path - Lucene::String pathField; - try { - pathField = doc->get(LuceneFieldNames::Content::kPath); - if (pathField.empty()) { - qWarning() << "Document missing path field at index:" << scoreDoc->doc; - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving path field:" << e.what(); - continue; - } - - QString path = QString::fromStdWString(pathField); - - if (!path.startsWith(searchPath)) { - continue; - } - - if (std::any_of(searchExcludedPaths.cbegin(), searchExcludedPaths.cend(), - [&path](const auto &excluded) { return path.startsWith(excluded); })) { + // Path filtering, hidden file exclusion — handled at query layer + Lucene::String pathField = doc->get(LuceneFieldNames::Content::kPath); + if (pathField.empty()) { + qWarning() << "Document missing path field at index:" << scoreDoc->doc; continue; } - // Safely check hidden status - if (Q_LIKELY(!m_options.includeHidden())) { - try { - Lucene::String hiddenField = doc->get(LuceneFieldNames::Content::kIsHidden); - if (!hiddenField.empty() && QString::fromStdWString(hiddenField).toLower() == "y") { - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving is_hidden field:" << e.what(); - // Default to visible if field can't be read - } - } - - // 创建搜索结果 - SearchResult result(path); - - // 设置内容结果 + SearchResult result(QString::fromStdWString(pathField)); ContentResultAPI resultApi(result); - // 使用ContentHighlighter命名空间进行高亮 if (enableRetrieval) { try { - // Safely get contents with null check Lucene::String contentField = doc->get(LuceneFieldNames::Content::kContents); if (!contentField.empty()) { const 
QString content = QString::fromStdWString(contentField); @@ -345,15 +384,13 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr } } catch (const Lucene::LuceneException &e) { qWarning() << "Exception retrieving content field:" << QString::fromStdWString(e.getError()); - // Continue without content highlight } catch (const std::exception &e) { qWarning() << "Standard exception retrieving content field:" << e.what(); - // Continue without content highlight } } // 设置详细结果(如果启用) - if (Q_UNLIKELY(m_options.detailedResultsEnabled())) { + if (Q_UNLIKELY(detailedResults)) { // 文件名 Lucene::String filenameField = doc->get(LuceneFieldNames::Content::kFilename); if (!filenameField.empty()) { @@ -385,14 +422,24 @@ void ContentIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr resultApi.setBirthTimestamp(timestamp); } } + + // 文件大小 + Lucene::String fileSizeField = doc->get(LuceneFieldNames::Content::kFileSize); + if (!fileSizeField.empty()) { + bool ok = false; + qint64 fileSize = QString::fromStdWString(fileSizeField).toLongLong(&ok); + if (ok && fileSize > 0) { + resultApi.setFileSizeBytes(fileSize); + } + } } // 添加到结果集合 - m_results.append(result); + m_results.append(std::move(result)); // 实时发送结果 if (Q_UNLIKELY(m_options.resultFoundEnabled())) - emit resultFound(result); + emit resultFound(m_results.last()); } catch (const Lucene::LuceneException &e) { qWarning() << "Error processing result:" << QString::fromStdWString(e.getError()); @@ -436,11 +483,8 @@ void ContentIndexedStrategy::performContentSearch(const SearchQuery &query) // 创建搜索器 IndexSearcherPtr searcher = newLucene(reader); - // 创建分析器 - AnalyzerPtr analyzer = newLucene(); - // 构建查询 - m_currentQuery = buildLuceneQuery(query, analyzer, m_options.searchPath()); + m_currentQuery = buildLuceneQuery(query); if (!m_currentQuery) { qWarning() << "Failed to build Lucene query"; emit errorOccurred(SearchError(ContentSearchErrorCode::ContentIndexException)); diff --git 
a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h index 724474ed..95e56c76 100644 --- a/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/contentsearch/contentstrategies/indexedstrategy.h @@ -7,7 +7,6 @@ #include "basestrategy.h" #include -#include #include #include #include @@ -39,22 +38,15 @@ class ContentIndexedStrategy : public ContentBaseStrategy void performContentSearch(const SearchQuery &query); // Build Lucene query - Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath); + Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query); // Helper for simple queries (original logic for "contents" field) - Lucene::QueryPtr buildSimpleContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &contentsParser); + Lucene::QueryPtr buildSimpleContentsQuery(const SearchQuery &query); // Helper for "standard" boolean logic (original logic for "contents" field, handles AND/OR) - Lucene::QueryPtr buildStandardBooleanContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &contentsParser); + Lucene::QueryPtr buildStandardBooleanContentsQuery(const SearchQuery &query); // Helper for "advanced" mixed AND logic (searches "contents" and "filename") - Lucene::QueryPtr buildAdvancedAndQuery( - const SearchQuery &query, // Operator is implicitly AND - const Lucene::QueryParserPtr &contentsParser, - const Lucene::AnalyzerPtr &analyzer); // Analyzer is needed to create filenameParser + Lucene::QueryPtr buildAdvancedAndQuery(const SearchQuery &query); // Operator is implicitly AND // Process search results void processSearchResults(const Lucene::IndexSearcherPtr &searcher, diff --git a/src/dfm-search/dfm-search-lib/core/searchfactory.cpp b/src/dfm-search/dfm-search-lib/core/searchfactory.cpp index 
4d092ccf..d266b617 100644 --- a/src/dfm-search/dfm-search-lib/core/searchfactory.cpp +++ b/src/dfm-search/dfm-search-lib/core/searchfactory.cpp @@ -20,6 +20,7 @@ SearchEngine *SearchFactory::createEngine(SearchType type, QObject *parent) case SearchType::Ocr: engine = new SearchEngine(type, parent); break; + case SearchType::Semantic: case SearchType::Custom: // TODO: Created by application based on provider break; diff --git a/src/dfm-search/dfm-search-lib/core/searchoptions.cpp b/src/dfm-search/dfm-search-lib/core/searchoptions.cpp index 8620b773..07376898 100644 --- a/src/dfm-search/dfm-search-lib/core/searchoptions.cpp +++ b/src/dfm-search/dfm-search-lib/core/searchoptions.cpp @@ -78,12 +78,35 @@ void SearchOptions::setCaseSensitive(bool sensitive) QString SearchOptions::searchPath() const { + if (!d->searchPathsList.isEmpty()) { + return d->searchPathsList.first(); + } return d->searchPath; } void SearchOptions::setSearchPath(const QString &path) { d->searchPath = path; + d->searchPathsList.clear(); +} + +QStringList SearchOptions::searchPaths() const +{ + if (!d->searchPathsList.isEmpty()) { + return d->searchPathsList; + } + if (!d->searchPath.isEmpty()) { + return QStringList { d->searchPath }; + } + return {}; +} + +void SearchOptions::setSearchPaths(const QStringList &paths) +{ + d->searchPathsList = paths; + if (!paths.isEmpty()) { + d->searchPath = paths.first(); + } } QStringList SearchOptions::searchExcludedPaths() const @@ -197,4 +220,24 @@ void SearchOptions::clearTimeRangeFilter() d->timeRangeFilter.clear(); } +void SearchOptions::setSizeRangeFilter(const SizeRangeFilter &filter) +{ + d->sizeRangeFilter = filter; +} + +SizeRangeFilter SearchOptions::sizeRangeFilter() const +{ + return d->sizeRangeFilter; +} + +bool SearchOptions::hasSizeRangeFilter() const +{ + return d->sizeRangeFilter.isValid(); +} + +void SearchOptions::clearSizeRangeFilter() +{ + d->sizeRangeFilter.clear(); +} + DFM_SEARCH_END_NS diff --git 
a/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h b/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h index b5813887..0f99ed57 100644 --- a/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h +++ b/src/dfm-search/dfm-search-lib/core/searchoptionsdata.h @@ -9,6 +9,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -30,6 +31,7 @@ class SearchOptionsData SearchMethod method; ///< The search method to use bool caseSensitive; ///< Whether search is case sensitive QString searchPath; ///< The path to search in + QStringList searchPathsList; ///< Multiple search paths QStringList searchExcludedPaths; ///< excluded search paths. bool includeHidden; ///< Whether to include hidden files int maxResults; ///< Maximum number of results to return @@ -39,6 +41,7 @@ class SearchOptionsData int syncSearchTimeoutSecs { 60 }; int batchTimeMs { 1000 }; ///< Batch processing time interval in milliseconds TimeRangeFilter timeRangeFilter; ///< Time range filter for search + SizeRangeFilter sizeRangeFilter; ///< File size range filter for search }; DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp b/src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp new file mode 100644 index 00000000..bc3b50b5 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/core/sizerangefilter.cpp @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later +#include + +DFM_SEARCH_BEGIN_NS + +class SizeRangeFilterData +{ +public: + SizeRangeFilterData() + : minSize(0), maxSize(0), includeLower(true), includeUpper(true) + { + } + + SizeRangeFilterData(const SizeRangeFilterData &other) + : minSize(other.minSize), maxSize(other.maxSize), + includeLower(other.includeLower), includeUpper(other.includeUpper) + { + } + + qint64 minSize; + qint64 maxSize; + bool includeLower; + bool includeUpper; +}; + +SizeRangeFilter::SizeRangeFilter() + : d(std::make_unique()) +{ +} + +SizeRangeFilter::SizeRangeFilter(const SizeRangeFilter &other) + : d(std::make_unique(*other.d)) +{ +} + +SizeRangeFilter::SizeRangeFilter(SizeRangeFilter &&other) noexcept + : d(std::move(other.d)) +{ +} + +SizeRangeFilter::~SizeRangeFilter() = default; + +SizeRangeFilter &SizeRangeFilter::operator=(const SizeRangeFilter &other) +{ + if (this != &other) { + d = std::make_unique(*other.d); + } + return *this; +} + +SizeRangeFilter &SizeRangeFilter::operator=(SizeRangeFilter &&other) noexcept +{ + if (this != &other) { + d = std::move(other.d); + } + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setMin(qint64 minSize) +{ + d->minSize = minSize; + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setMax(qint64 maxSize) +{ + d->maxSize = maxSize; + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setRange(qint64 minSize, qint64 maxSize) +{ + d->minSize = minSize; + d->maxSize = maxSize; + return *this; +} + +qint64 SizeRangeFilter::minSize() const +{ + return d->minSize; +} + +qint64 SizeRangeFilter::maxSize() const +{ + return d->maxSize; +} + +SizeRangeFilter &SizeRangeFilter::setIncludeLower(bool include) +{ + d->includeLower = include; + return *this; +} + +SizeRangeFilter &SizeRangeFilter::setIncludeUpper(bool include) +{ + d->includeUpper = include; + return *this; +} + +bool SizeRangeFilter::includeLower() const +{ + return d->includeLower; +} + +bool SizeRangeFilter::includeUpper() const 
+{ + return d->includeUpper; +} + +SizeRangeFilter &SizeRangeFilter::clear() +{ + d->minSize = 0; + d->maxSize = 0; + d->includeLower = true; + d->includeUpper = true; + return *this; +} + +bool SizeRangeFilter::isValid() const +{ + return d->minSize > 0 || d->maxSize > 0; +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/dfm-search.cmake b/src/dfm-search/dfm-search-lib/dfm-search.cmake index 6696236b..6b0ce4f2 100644 --- a/src/dfm-search/dfm-search-lib/dfm-search.cmake +++ b/src/dfm-search/dfm-search-lib/dfm-search.cmake @@ -18,6 +18,10 @@ add_library(${BIN_NAME} SHARED ${SRCS} ) +target_compile_definitions(${BIN_NAME} PRIVATE + CMAKE_INSTALL_PREFIX="${CMAKE_INSTALL_PREFIX}" +) + target_link_libraries(${BIN_NAME} PUBLIC Qt${QT_VERSION_MAJOR}::Core Dtk${DFM_VERSION_MAJOR}::Core @@ -76,6 +80,12 @@ install(DIRECTORY FILES_MATCHING PATTERN "*.h" ) +# install semantic search rules (locale subdirectories preserved) +install(DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR}/semantic/rules/ + DESTINATION share/deepin/dfm-search/semantic/rules +) + # for pc file config - update to include all dependencies set(PC_LIBS_PRIVATE Qt${QT_VERSION_MAJOR}Core dtk${DFM_VERSION_MAJOR}core) set(PC_REQ_PRIVATE liblucene++ liblucene++-contrib) diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp index 8770125a..c47de316 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchapi.cpp @@ -168,4 +168,16 @@ QString FileNameResultAPI::birthTimeString() const return ts > 0 ? 
TimeResultAPI::formatTimestamp(ts) : QString(); } +// ==================== File Size (Numeric) ==================== + +void FileNameResultAPI::setFileSizeBytes(qint64 bytes) +{ + m_result.setCustomAttribute("fileSizeBytes", bytes); +} + +qint64 FileNameResultAPI::fileSizeBytes() const +{ + return m_result.customAttribute("fileSizeBytes").toLongLong(); +} + DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp index 8b1085ef..301bf1a4 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamesearchengine.cpp @@ -56,8 +56,9 @@ SearchError FileNameSearchEngine::validateSearchConditions() // 文件名搜索特定验证 if (m_currentQuery.type() == SearchQuery::Type::Simple || m_currentQuery.type() == SearchQuery::Type::Wildcard) { - // 允许对一个类型, 后缀进行搜索,获取类型下所有文件 - if (m_currentQuery.keyword().isEmpty() && fileTypes.isEmpty() && fileExts.isEmpty()) { + // 允许对类型/后缀/大小范围/时间范围进行搜索,获取满足条件的所有文件 + if (m_currentQuery.keyword().isEmpty() && fileTypes.isEmpty() && fileExts.isEmpty() + && !m_options.hasSizeRangeFilter() && !m_options.hasTimeRangeFilter()) { return SearchError(FileNameSearchErrorCode::KeywordIsEmpty); } diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp index 6746fa2f..985f7dde 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.cpp @@ -14,8 +14,8 @@ #include #include +#include -#include "3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" #include "utils/searchutility.h" #include "utils/lucenequeryutils.h" @@ -84,9 +84,8 @@ Lucene::QueryPtr QueryBuilder::buildPinyinQuery(const QStringList &pinyins, 
Sear for (const QString &pinyin : pinyins) { QString cleanPinyin = pinyin.trimmed(); if (!cleanPinyin.isEmpty() && Global::isPinyinSequence(cleanPinyin)) { - // 复用buildCommonQuery,指定pinyin字段,让分析器自动处理匹配 - QueryPtr termQuery = buildCommonQuery(cleanPinyin, false, newLucene(), - QString::fromWCharArray(LuceneFieldNames::FileName::kPinyin), false); + QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::FileName::kPinyin), cleanPinyin); if (termQuery) { pinyinQuery->add(termQuery, op == SearchQuery::BooleanOperator::AND ? BooleanClause::MUST : BooleanClause::SHOULD); } @@ -107,10 +106,8 @@ Lucene::QueryPtr QueryBuilder::buildPinyinAcronymQuery(const QStringList &acrony for (const QString &acronym : acronyms) { QString cleanAcronym = acronym.trimmed(); if (!cleanAcronym.isEmpty()) { - // 复用buildCommonQuery,指定pinyin_acronym字段,让分析器自动处理匹配 - QueryPtr termQuery = buildCommonQuery(cleanAcronym, false, - newLucene(), - QString::fromWCharArray(LuceneFieldNames::FileName::kPinyinAcronym), false); + QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::FileName::kPinyinAcronym), cleanAcronym); if (termQuery) { acronymQuery->add(termQuery, op == SearchQuery::BooleanOperator::AND ? 
BooleanClause::MUST : BooleanClause::SHOULD); } @@ -120,48 +117,13 @@ Lucene::QueryPtr QueryBuilder::buildPinyinAcronymQuery(const QStringList &acrony return acronymQuery; } -Lucene::QueryPtr QueryBuilder::buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, bool allowWildcard) const +Lucene::QueryPtr QueryBuilder::buildSimpleQuery(const QString &keyword, bool caseSensitive) const { - if (keyword.isEmpty() || !analyzer) { - return nullptr; - } - - Lucene::QueryParserPtr parser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::FileName::kFileName, - analyzer); - - if (allowWildcard) { - parser->setAllowLeadingWildcard(true); - } - - return parser->parse(LuceneQueryUtils::processQueryString(keyword, caseSensitive)); -} - -Lucene::QueryPtr QueryBuilder::buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, const QString &fieldName, bool allowWildcard) const -{ - if (keyword.isEmpty() || !analyzer || fieldName.isEmpty()) { - return nullptr; - } - - Lucene::QueryParserPtr parser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - StringUtils::toUnicode(fieldName.toStdString()), - analyzer); - - if (allowWildcard) { - parser->setAllowLeadingWildcard(true); - } - - return parser->parse(LuceneQueryUtils::processQueryString(keyword, caseSensitive)); + return LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::FileName::kFileName), keyword, caseSensitive); } -Lucene::QueryPtr QueryBuilder::buildSimpleQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const -{ - return buildCommonQuery(keyword, caseSensitive, analyzer, false); -} - -Lucene::QueryPtr QueryBuilder::buildWildcardQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const +Lucene::QueryPtr QueryBuilder::buildWildcardQuery(const QString &keyword, bool caseSensitive) const { if (keyword.isEmpty()) { 
return nullptr; @@ -176,27 +138,6 @@ Lucene::QueryPtr QueryBuilder::buildWildcardQuery(const QString &keyword, bool c StringUtils::toUnicode(processedKeyword.toStdString()))); } -Lucene::QueryPtr QueryBuilder::buildBooleanQuery(const QStringList &terms, bool caseSensitive, SearchQuery::BooleanOperator op, const Lucene::AnalyzerPtr &analyzer) const -{ - if (terms.isEmpty() || !analyzer) { - return nullptr; - } - - BooleanQueryPtr booleanQuery = newLucene(); - booleanQuery->setMaxClauseCount(1024); - - for (const QString &term : terms) { - if (!term.isEmpty()) { - QueryPtr termQuery = buildCommonQuery(term, caseSensitive, analyzer, false); - if (termQuery) { - booleanQuery->add(termQuery, op == SearchQuery::BooleanOperator::AND ? BooleanClause::MUST : BooleanClause::SHOULD); - } - } - } - - return booleanQuery; -} - //-------------------------------------------------------------------- // IndexManager 实现 //-------------------------------------------------------------------- @@ -315,8 +256,6 @@ void FileNameIndexedStrategy::search(const SearchQuery &query) void FileNameIndexedStrategy::performIndexSearch(const SearchQuery &query, const FileNameOptionsAPI &api) { bool caseSensitive = m_options.caseSensitive(); - const QString &searchPath = m_options.searchPath(); - const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); QStringList fileTypes = api.fileTypes(); QStringList fileExtensions = api.fileExtensions(); @@ -330,7 +269,7 @@ void FileNameIndexedStrategy::performIndexSearch(const SearchQuery &query, const IndexQuery indexQuery = buildIndexQuery(query, searchType, caseSensitive, pinyinEnabled, pinyinAcronymEnabled, fileTypes, fileExtensions); // 3. 
执行查询并处理结果 - executeIndexQuery(indexQuery, searchPath, searchExcludedPaths); + executeIndexQuery(indexQuery); } FileNameIndexedStrategy::SearchType FileNameIndexedStrategy::determineSearchType( @@ -445,7 +384,7 @@ FileNameIndexedStrategy::IndexQuery FileNameIndexedStrategy::buildIndexQuery( return result; } -void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const QString &searchPath, const QStringList &searchExcludedPaths) +void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query) { // 获取索引目录 FSDirectoryPtr directory = m_indexManager->getIndexDirectory(m_indexDir); @@ -478,7 +417,7 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q // 构建查询 QueryPtr luceneQuery; try { - luceneQuery = buildLuceneQuery(query, searchPath); + luceneQuery = buildLuceneQuery(query); if (!luceneQuery) { emit errorOccurred(SearchError(SearchErrorCode::InvalidQuery)); return; @@ -533,27 +472,14 @@ void FileNameIndexedStrategy::executeIndexQuery(const IndexQuery &query, const Q DocumentPtr doc = searcher->doc(scoreDoc->doc); QString path = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFullPath)); - if (!path.startsWith(searchPath)) { - continue; - } - - if (std::any_of(searchExcludedPaths.cbegin(), searchExcludedPaths.cend(), - [&path](const auto &excluded) { return path.startsWith(excluded); })) { - continue; - } - - if (Q_LIKELY(!m_options.includeHidden())) { - if (QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kIsHidden)).toLower() == "y") - continue; - } + // Path filtering, excluded paths, hidden file — handled at query layer // 处理搜索结果 if (Q_UNLIKELY(m_options.detailedResultsEnabled())) { m_results.append(processDetailedSearchResult(path, doc)); } else { - // perf: quickly SearchResult result(path); - m_results.append(result); + m_results.append(std::move(result)); } // 实时发送结果 @@ -594,6 +520,16 @@ SearchResult FileNameIndexedStrategy::processDetailedSearchResult( QString size = 
QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFileSizeStr)); api.setSize(size); + // 文件大小(数值,字节) + QString fileSizeStr = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kFileSize)); + if (!fileSizeStr.isEmpty()) { + bool ok = false; + qint64 fileSizeBytes = fileSizeStr.toLongLong(&ok); + if (ok && fileSizeBytes >= 0) { + api.setFileSizeBytes(fileSizeBytes); + } + } + // 隐藏状态 QString isHiddenStr = QString::fromStdWString(doc->get(LuceneFieldNames::FileName::kIsHidden)).toLower(); api.setIsHidden(isHiddenStr == "y"); @@ -621,16 +557,15 @@ SearchResult FileNameIndexedStrategy::processDetailedSearchResult( return result; } -Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &query, const QString &searchPath) const +Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &query) const { BooleanQueryPtr finalQuery = newLucene(); bool hasValidQuery = false; - AnalyzerPtr analyzer = newLucene(); switch (query.type) { case SearchType::Simple: if (!query.terms.isEmpty()) { - QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive); if (simpleQuery) { finalQuery->add(simpleQuery, BooleanClause::MUST); hasValidQuery = true; @@ -639,7 +574,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que break; case SearchType::Wildcard: if (!query.terms.isEmpty()) { - QueryPtr wildcardQuery = m_queryBuilder->buildWildcardQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr wildcardQuery = m_queryBuilder->buildWildcardQuery(query.terms.first(), query.caseSensitive); if (wildcardQuery) { finalQuery->add(wildcardQuery, BooleanClause::MUST); hasValidQuery = true; @@ -648,7 +583,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que break; case SearchType::Boolean: if (!query.terms.isEmpty()) 
{ - BooleanQueryPtr booleanQuery = buildBooleanTermsQuery(query, analyzer); + BooleanQueryPtr booleanQuery = buildBooleanTermsQuery(query); if (booleanQuery) { finalQuery->add(booleanQuery, BooleanClause::MUST); hasValidQuery = true; @@ -669,7 +604,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } // 添加普通关键词查询 - QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive); if (simpleQuery) { combinedQuery->add(simpleQuery, BooleanClause::SHOULD); hasValidQuery = true; @@ -694,7 +629,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que } // 添加普通关键词查询 - QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive, analyzer); + QueryPtr simpleQuery = m_queryBuilder->buildSimpleQuery(query.terms.first(), query.caseSensitive); if (simpleQuery) { combinedQuery->add(simpleQuery, BooleanClause::SHOULD); hasValidQuery = true; @@ -707,7 +642,7 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que break; case SearchType::Combined: if (!query.terms.isEmpty()) { - BooleanQueryPtr combinedQuery = buildBooleanTermsQuery(query, analyzer); + BooleanQueryPtr combinedQuery = buildBooleanTermsQuery(query); if (combinedQuery) { finalQuery->add(combinedQuery, BooleanClause::MUST); hasValidQuery = true; @@ -737,33 +672,54 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); - auto [start, end] = filter.resolveTimeRange(); + QueryPtr timeQuery = TimeRangeUtils::buildTimeRangeFilterQuery( + filter, + LuceneFieldNames::FileName::kBirthTime, + LuceneFieldNames::FileName::kModifyTime); - qint64 startEpoch = TimeRangeUtils::toEpochSecs(start); - qint64 endEpoch = 
TimeRangeUtils::toEpochSecs(end); + if (timeQuery) { + finalQuery->add(timeQuery, BooleanClause::MUST); + hasValidQuery = true; + } + } - const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) - ? LuceneFieldNames::FileName::kBirthTime - : LuceneFieldNames::FileName::kModifyTime; + // Add file size range filter query + if (m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); - QueryPtr timeQuery = TimeRangeUtils::buildNumericRangeQuery( - fieldName, startEpoch, endEpoch, - filter.includeLower(), filter.includeUpper()); + QueryPtr sizeQuery = TimeRangeUtils::buildNumericRangeQuery( + LuceneFieldNames::FileName::kFileSize, + sizeFilter.minSize(), sizeFilter.maxSize(), + sizeFilter.includeLower(), sizeFilter.includeUpper()); - if (timeQuery) { - finalQuery->add(timeQuery, BooleanClause::MUST); + if (sizeQuery) { + finalQuery->add(sizeQuery, BooleanClause::MUST); hasValidQuery = true; } } // Add path prefix query optimization - if (hasValidQuery && SearchUtility::isFilenameIndexAncestorPathsSupported() - && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildPathPrefixQuery(searchPath, - QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); + QStringList searchPathsList = m_options.searchPaths(); + if (hasValidQuery) { + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); if (pathPrefixQuery) { finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using path prefix query for optimization:" << searchPath; + qInfo() << "Using multi-path prefix query for optimization:" << searchPathsList; + } + } + + // Add excluded paths filter (pushed down to query layer) + if (hasValidQuery) { + const QStringList &excludedPaths = m_options.searchExcludedPaths(); + if (!excludedPaths.isEmpty()) { + QueryPtr excludedQuery = 
LuceneQueryUtils::buildMultiPathPrefixQuery( + excludedPaths, + QString::fromWCharArray(LuceneFieldNames::FileName::kAncestorPaths)); + if (excludedQuery) { + finalQuery->add(excludedQuery, BooleanClause::MUST_NOT); + } } } @@ -772,14 +728,14 @@ Lucene::QueryPtr FileNameIndexedStrategy::buildLuceneQuery(const IndexQuery &que QueryPtr hiddenQuery = Lucene::newLucene( Lucene::newLucene( LuceneFieldNames::FileName::kIsHidden, - Lucene::StringUtils::toUnicode("Y"))); + L"Y")); finalQuery->add(hiddenQuery, Lucene::BooleanClause::MUST_NOT); } return hasValidQuery ? finalQuery : nullptr; } -BooleanQueryPtr FileNameIndexedStrategy::buildBooleanTermsQuery(const IndexQuery &query, const AnalyzerPtr &analyzer) const +BooleanQueryPtr FileNameIndexedStrategy::buildBooleanTermsQuery(const IndexQuery &query) const { // 创建布尔查询 BooleanQueryPtr booleanQuery = newLucene(); @@ -791,7 +747,7 @@ BooleanQueryPtr FileNameIndexedStrategy::buildBooleanTermsQuery(const IndexQuery bool termHasQuery = false; // 添加普通关键词查询 - QueryPtr keywordQuery = m_queryBuilder->buildSimpleQuery(term, query.caseSensitive, analyzer); + QueryPtr keywordQuery = m_queryBuilder->buildSimpleQuery(term, query.caseSensitive); if (keywordQuery) { termQuery->add(keywordQuery, BooleanClause::SHOULD); termHasQuery = true; diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h index 2d4f143f..034ee4b8 100644 --- a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/indexedstrategy.h @@ -83,13 +83,13 @@ class FileNameIndexedStrategy : public FileNameBaseStrategy const QStringList &fileExtensions); // 执行索引查询并处理结果 - void executeIndexQuery(const IndexQuery &query, const QString &searchPath, const QStringList &searchExcludedPaths); + void executeIndexQuery(const IndexQuery &query); // 构建 Lucene 查询 - QueryPtr 
buildLuceneQuery(const IndexQuery &query, const QString &searchPath) const; + QueryPtr buildLuceneQuery(const IndexQuery &query) const; // 构建布尔查询的辅助方法 - BooleanQueryPtr buildBooleanTermsQuery(const IndexQuery &query, const AnalyzerPtr &analyzer) const; + BooleanQueryPtr buildBooleanTermsQuery(const IndexQuery &query) const; // 处理详细搜索结果(读取所有索引字段) SearchResult processDetailedSearchResult(const QString &path, const Lucene::DocumentPtr &doc); @@ -116,14 +116,8 @@ class QueryBuilder QueryPtr buildExtQuery(const QStringList &extensions) const; QueryPtr buildPinyinQuery(const QStringList &pinyins, SearchQuery::BooleanOperator op = SearchQuery::BooleanOperator::AND) const; QueryPtr buildPinyinAcronymQuery(const QStringList &acronyms, SearchQuery::BooleanOperator op = SearchQuery::BooleanOperator::AND) const; - QueryPtr buildBooleanQuery(const QStringList &terms, bool caseSensitive, SearchQuery::BooleanOperator op, const Lucene::AnalyzerPtr &analyzer) const; - QueryPtr buildWildcardQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const; - QueryPtr buildSimpleQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer) const; - -private: - // 通用的查询构建方法 - QueryPtr buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, bool allowWildcard = false) const; - QueryPtr buildCommonQuery(const QString &keyword, bool caseSensitive, const Lucene::AnalyzerPtr &analyzer, const QString &fieldName, bool allowWildcard = false) const; + QueryPtr buildWildcardQuery(const QString &keyword, bool caseSensitive) const; + QueryPtr buildSimpleQuery(const QString &keyword, bool caseSensitive) const; }; /** diff --git a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp index dd0ff80f..e5c70609 100644 --- 
a/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp +++ b/src/dfm-search/dfm-search-lib/filenamesearch/filenamestrategies/realtimestrategy.cpp @@ -12,6 +12,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -119,9 +120,9 @@ void FileNameRealTimeStrategy::search(const SearchQuery &query) QString fileName = info.fileName(); bool matches = false; - // 如果只有时间过滤没有关键词,直接匹配 + // 如果只有过滤条件(时间/大小)没有关键词,直接匹配 bool hasKeyword = !query.keyword().isEmpty() || query.type() == SearchQuery::Type::Boolean; - if (!hasKeyword && m_options.hasTimeRangeFilter()) { + if (!hasKeyword && (m_options.hasTimeRangeFilter() || m_options.hasSizeRangeFilter())) { matches = true; } // 简单查询模式 @@ -174,6 +175,29 @@ void FileNameRealTimeStrategy::search(const SearchQuery &query) matches = timeMatch; } + // 文件大小范围过滤 + if (matches && m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + qint64 fileSize = info.size(); + + bool sizeMatch = true; + if (sizeFilter.minSize() > 0) { + if (sizeFilter.includeLower()) { + sizeMatch = sizeMatch && (fileSize >= sizeFilter.minSize()); + } else { + sizeMatch = sizeMatch && (fileSize > sizeFilter.minSize()); + } + } + if (sizeFilter.maxSize() > 0) { + if (sizeFilter.includeUpper()) { + sizeMatch = sizeMatch && (fileSize <= sizeFilter.maxSize()); + } else { + sizeMatch = sizeMatch && (fileSize < sizeFilter.maxSize()); + } + } + matches = sizeMatch; + } + if (matches) { // 创建搜索结果 SearchResult result(info.filePath()); @@ -186,6 +210,7 @@ void FileNameRealTimeStrategy::search(const SearchQuery &query) api.setFileType(info.suffix().isEmpty() ? 
"unknown" : info.suffix().toLower()); api.setFileExtension(info.suffix().toLower()); api.setSize(QString::number(info.size())); + api.setFileSizeBytes(info.size()); } else { api.setFileType("dir"); } diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp index 30e3f8c8..6ad7bb32 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchapi.cpp @@ -44,4 +44,14 @@ void OcrTextResultAPI::setOcrContent(const QString &content) m_result.setCustomAttribute("ocrContent", content); } +QString OcrTextResultAPI::checksum() const +{ + return m_result.customAttribute("checksum").toString(); +} + +void OcrTextResultAPI::setChecksum(const QString &checksum) +{ + m_result.setCustomAttribute("checksum", checksum); +} + DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp index ed9172bb..476bd8ce 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextsearchengine.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "ocrtextsearchengine.h" +#include + #include "ocrtextstrategies/indexedstrategy.h" DFM_SEARCH_BEGIN_NS @@ -40,8 +42,10 @@ SearchError OcrTextSearchEngine::validateSearchConditions() return SearchError(OcrTextSearchErrorCode::WildcardNotSupported); } + OcrTextOptionsAPI optAPI(m_options); if (m_currentQuery.type() == SearchQuery::Type::Simple - && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength) { + && m_currentQuery.keyword().toUtf8().size() < Global::kMinContentSearchKeywordLength + && optAPI.filenameKeyword().isEmpty()) { return SearchError(OcrTextSearchErrorCode::KeywordTooShort); } diff --git 
a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp index e561b2a6..5361134e 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.cpp @@ -9,20 +9,18 @@ #include #include -#include #include #include #include +#include #include #include #include -#include "3rdparty/fulltext/chineseanalyzer.h" #include "utils/cancellablecollector.h" #include "utils/contenthighlighter.h" #include "utils/lucenequeryutils.h" -#include "utils/searchutility.h" #include "utils/lucene_cancellation_compat.h" #include "utils/timerangeutils.h" @@ -63,21 +61,16 @@ void OcrTextIndexedStrategy::search(const SearchQuery &query) } } -Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath) +Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &query) { try { m_keywords.clear(); OcrTextOptionsAPI optAPI(m_options); bool mixedAndEnabled = optAPI.isFilenameOcrContentMixedAndSearchEnabled(); - Lucene::QueryParserPtr ocrContentsParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::OcrText::kOcrContents, - analyzer); - Lucene::QueryPtr mainQuery; if (query.type() == SearchQuery::Type::Simple) { - mainQuery = buildSimpleOcrContentsQuery(query, ocrContentsParser); + mainQuery = buildSimpleOcrContentsQuery(query); } else if (query.type() == SearchQuery::Type::Boolean) { if (query.subQueries().isEmpty()) { // For an empty boolean query, match nothing. 
@@ -86,10 +79,10 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que // Determine which logic path to take for boolean queries if (mixedAndEnabled && query.booleanOperator() == SearchQuery::BooleanOperator::AND) { // New "advanced" AND logic for ocr_contents/filename - mainQuery = buildAdvancedAndQuery(query, ocrContentsParser, analyzer); + mainQuery = buildAdvancedAndQuery(query); } else { // "Standard" ocr_contents-only logic - mainQuery = buildStandardBooleanOcrContentsQuery(query, ocrContentsParser); + mainQuery = buildStandardBooleanOcrContentsQuery(query); } } } else { @@ -97,35 +90,85 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que mainQuery = newLucene(); // Should not happen } + // Add filename keyword query (before filters, so it replaces empty content query correctly) + QString filenameKw = optAPI.filenameKeyword(); + if (!filenameKw.isEmpty()) { + Lucene::QueryPtr filenameQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kFilename), + filenameKw); + + if (filenameQuery) { + // Check if content keywords are effectively empty + bool noContentKeywords = (query.type() == SearchQuery::Type::Simple) + ? 
query.keyword().isEmpty() + : (query.subQueries().isEmpty() + || std::all_of(query.subQueries().cbegin(), query.subQueries().cend(), + [](const auto &sq) { return sq.keyword().isEmpty(); })); + + if (noContentKeywords) { + // Filename-only search: replace empty content query with filename query + mainQuery = filenameQuery; + } else if (mainQuery) { + // Both content and filename: AND combination + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(filenameQuery, BooleanClause::MUST); + mainQuery = finalQuery; + } + m_keywords.append(filenameKw); + } + } + // Add path prefix query optimization - if (mainQuery && SearchUtility::isOcrTextIndexAncestorPathsSupported() - && SearchUtility::shouldUsePathPrefixQuery(searchPath)) { - QueryPtr pathPrefixQuery = LuceneQueryUtils::buildPathPrefixQuery(searchPath, - QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); + QStringList searchPathsList = m_options.searchPaths(); + if (mainQuery) { + QueryPtr pathPrefixQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + searchPathsList, + QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); if (pathPrefixQuery) { BooleanQueryPtr finalQuery = newLucene(); finalQuery->add(mainQuery, BooleanClause::MUST); finalQuery->add(pathPrefixQuery, BooleanClause::MUST); - qInfo() << "Using path prefix query for OCR text search optimization:" << searchPath; + qInfo() << "Using multi-path prefix query for OCR text search optimization:" << searchPathsList; mainQuery = finalQuery; } } + // Add excluded paths filter (pushed down to query layer) + if (mainQuery) { + const QStringList &excludedPaths = m_options.searchExcludedPaths(); + if (!excludedPaths.isEmpty()) { + QueryPtr excludedQuery = LuceneQueryUtils::buildMultiPathPrefixQuery( + excludedPaths, + QString::fromWCharArray(LuceneFieldNames::OcrText::kAncestorPaths)); + if (excludedQuery) { + BooleanQueryPtr finalQuery = newLucene(); + 
finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(excludedQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + } + } + + // Add hidden file filter (pushed down to query layer) + if (mainQuery && !m_options.includeHidden()) { + QueryPtr hiddenQuery = Lucene::newLucene( + Lucene::newLucene( + LuceneFieldNames::OcrText::kIsHidden, + L"Y")); + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(hiddenQuery, BooleanClause::MUST_NOT); + mainQuery = finalQuery; + } + // Add time range filter query if (m_options.hasTimeRangeFilter()) { TimeRangeFilter filter = m_options.timeRangeFilter(); - auto [start, end] = filter.resolveTimeRange(); - - qint64 startEpoch = TimeRangeUtils::toEpochSecs(start); - qint64 endEpoch = TimeRangeUtils::toEpochSecs(end); - - const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) - ? LuceneFieldNames::OcrText::kBirthTime - : LuceneFieldNames::OcrText::kModifyTime; - - QueryPtr timeQuery = TimeRangeUtils::buildNumericRangeQuery( - fieldName, startEpoch, endEpoch, - filter.includeLower(), filter.includeUpper()); + QueryPtr timeQuery = TimeRangeUtils::buildTimeRangeFilterQuery( + filter, + LuceneFieldNames::OcrText::kBirthTime, + LuceneFieldNames::OcrText::kModifyTime); if (timeQuery) { if (mainQuery) { @@ -140,6 +183,27 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } + // Add file size range filter query + if (m_options.hasSizeRangeFilter()) { + SizeRangeFilter sizeFilter = m_options.sizeRangeFilter(); + QueryPtr sizeQuery = TimeRangeUtils::buildNumericRangeQuery( + LuceneFieldNames::OcrText::kFileSize, + sizeFilter.minSize(), sizeFilter.maxSize(), + sizeFilter.includeLower(), sizeFilter.includeUpper()); + + if (sizeQuery) { + if (mainQuery) { + BooleanQueryPtr finalQuery = newLucene(); + finalQuery->add(mainQuery, BooleanClause::MUST); + finalQuery->add(sizeQuery, BooleanClause::MUST); + mainQuery = 
finalQuery; + } else { + // Size filter alone is a valid query + mainQuery = sizeQuery; + } + } + } + return mainQuery; } catch (const Lucene::LuceneException &e) { @@ -151,15 +215,9 @@ Lucene::QueryPtr OcrTextIndexedStrategy::buildLuceneQuery(const SearchQuery &que } } -QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, const Lucene::QueryParserPtr &ocrContentsParser, const Lucene::AnalyzerPtr &analyzer) +QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query) { // This method implements the "mixed" AND logic similar to content search. - // It requires its own filenameParser. - Lucene::QueryParserPtr filenameParser = newLucene( - Lucene::LuceneVersion::LUCENE_CURRENT, - LuceneFieldNames::OcrText::kFilename, - analyzer); - Lucene::BooleanQueryPtr overallQuery = newLucene(); Lucene::BooleanQueryPtr mainAndClausesQuery = newLucene(); Lucene::BooleanQueryPtr allOcrContentsQuery = newLucene(); @@ -173,10 +231,12 @@ QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, } hasValidKeywords = true; - // Use LuceneQueryUtils to process special characters - Lucene::String processedKeyword = LuceneQueryUtils::processQueryString(subQuery.keyword(), false); - Lucene::QueryPtr ocrContentsTermQuery = ocrContentsParser->parse(processedKeyword); - Lucene::QueryPtr filenameTermQuery = filenameParser->parse(processedKeyword); + Lucene::QueryPtr ocrContentsTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kOcrContents), + subQuery.keyword()); + Lucene::QueryPtr filenameTermQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kFilename), + subQuery.keyword()); // Build (ocr_contents:keyword OR filename:keyword) Lucene::BooleanQueryPtr combinedTermQuery = newLucene(); @@ -205,7 +265,7 @@ QueryPtr OcrTextIndexedStrategy::buildAdvancedAndQuery(const SearchQuery &query, return overallQuery; } -QueryPtr 
OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &ocrContentsParser) +QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const SearchQuery &query) { // This method implements the "original" boolean logic, searching only "ocr_contents". Lucene::BooleanQueryPtr booleanQuery = newLucene(); @@ -216,8 +276,9 @@ QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const Sear continue; // Skip empty keywords } - // Use LuceneQueryUtils to process special characters - Lucene::QueryPtr termQuery = ocrContentsParser->parse(LuceneQueryUtils::processQueryString(subQuery.keyword(), false)); + Lucene::QueryPtr termQuery = LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kOcrContents), + subQuery.keyword()); booleanQuery->add(termQuery, query.booleanOperator() == SearchQuery::BooleanOperator::AND ? Lucene::BooleanClause::MUST : Lucene::BooleanClause::SHOULD); } @@ -225,14 +286,15 @@ QueryPtr OcrTextIndexedStrategy::buildStandardBooleanOcrContentsQuery(const Sear return booleanQuery; } -QueryPtr OcrTextIndexedStrategy::buildSimpleOcrContentsQuery(const SearchQuery &query, const Lucene::QueryParserPtr &ocrContentsParser) +QueryPtr OcrTextIndexedStrategy::buildSimpleOcrContentsQuery(const SearchQuery &query) { m_keywords.append(query.keyword()); if (query.keyword().isEmpty()) { return newLucene(); // Match nothing for empty keyword } - // Use LuceneQueryUtils to process special characters - return ocrContentsParser->parse(LuceneQueryUtils::processQueryString(query.keyword(), false)); + return LuceneQueryUtils::buildNGramSearchQuery( + QString::fromWCharArray(LuceneFieldNames::OcrText::kOcrContents), + query.keyword()); } void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr &searcher, @@ -242,14 +304,34 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr QElapsedTimer 
resultTimer; resultTimer.start(); - QString searchPath = m_options.searchPath(); - const QStringList &searchExcludedPaths = m_options.searchExcludedPaths(); auto docsSize = scoreDocs.size(); OcrTextOptionsAPI optAPI(m_options); bool enableHTML = optAPI.isSearchResultHighlightEnabled(); int previewLen = optAPI.maxPreviewLength() > 0 ? optAPI.maxPreviewLength() : 50; bool enableRetrieval = optAPI.isFullTextRetrievalEnabled(); + bool detailedResults = m_options.detailedResultsEnabled(); + + // Build field selector to avoid loading the large 'ocr_contents' field when not needed. + // The ocr_contents field stores OCR-recognized text and loading it for every result + // (even when only path is needed) causes significant disk I/O overhead. + Lucene::Collection fieldsToLoad = Lucene::Collection::newInstance(); + if (enableRetrieval) { + fieldsToLoad.add(LuceneFieldNames::OcrText::kOcrContents); + } + fieldsToLoad.add(LuceneFieldNames::OcrText::kPath); + if (Q_UNLIKELY(detailedResults)) { + fieldsToLoad.add(LuceneFieldNames::OcrText::kFilename); + fieldsToLoad.add(LuceneFieldNames::OcrText::kIsHidden); + fieldsToLoad.add(LuceneFieldNames::OcrText::kModifyTime); + fieldsToLoad.add(LuceneFieldNames::OcrText::kBirthTime); + fieldsToLoad.add(LuceneFieldNames::OcrText::kCheckSum); + fieldsToLoad.add(LuceneFieldNames::OcrText::kFileSize); + } + Lucene::FieldSelectorPtr fieldSelector = newLucene(fieldsToLoad); + + // Pre-allocate to avoid reallocation during append + m_results.reserve(m_results.size() + static_cast(docsSize)); for (int32_t i = 0; i < docsSize; ++i) { if (m_cancelled.load()) { @@ -259,21 +341,14 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr try { Lucene::ScoreDocPtr scoreDoc = scoreDocs[i]; - if (!scoreDoc) { - qWarning() << "Null ScoreDoc encountered at index" << i; - continue; - } - - // Defensive check: verify document ID is valid - if (scoreDoc->doc < 0) { - qWarning() << "Invalid document ID:" << scoreDoc->doc; + if 
(!scoreDoc || scoreDoc->doc < 0) { + qWarning() << "Invalid ScoreDoc at index" << i; continue; } - // Safely retrieve document (could throw if index is corrupted) Lucene::DocumentPtr doc; try { - doc = searcher->doc(scoreDoc->doc); + doc = searcher->doc(scoreDoc->doc, fieldSelector); if (!doc) { qWarning() << "Failed to retrieve document at index:" << scoreDoc->doc; continue; @@ -286,45 +361,14 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr continue; } - // Safely get path - Lucene::String pathField; - try { - pathField = doc->get(LuceneFieldNames::OcrText::kPath); - if (pathField.empty()) { - qWarning() << "Document missing path field at index:" << scoreDoc->doc; - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving path field:" << e.what(); - continue; - } - - QString path = QString::fromStdWString(pathField); - - if (!path.startsWith(searchPath)) { - continue; - } - - if (std::any_of(searchExcludedPaths.cbegin(), searchExcludedPaths.cend(), - [&path](const auto &excluded) { return path.startsWith(excluded); })) { + // Path filtering, excluded paths, hidden file — handled at query layer + Lucene::String pathField = doc->get(LuceneFieldNames::OcrText::kPath); + if (pathField.empty()) { + qWarning() << "Document missing path field at index:" << scoreDoc->doc; continue; } - // Safely check hidden status - if (Q_LIKELY(!m_options.includeHidden())) { - try { - Lucene::String hiddenField = doc->get(LuceneFieldNames::OcrText::kIsHidden); - if (!hiddenField.empty() && QString::fromStdWString(hiddenField).toLower() == "y") { - continue; - } - } catch (const std::exception &e) { - qWarning() << "Exception retrieving is_hidden field:" << e.what(); - // Default to visible if field can't be read - } - } - - // Create search result - SearchResult result(path); + SearchResult result(QString::fromStdWString(pathField)); // 设置 OCR 内容结果 OcrTextResultAPI resultApi(result); @@ -332,7 +376,6 @@ void 
OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr // 使用ContentHighlighter命名空间进行高亮 if (enableRetrieval) { try { - // Safely get OCR contents with null check Lucene::String ocrContentField = doc->get(LuceneFieldNames::OcrText::kOcrContents); if (!ocrContentField.empty()) { const QString content = QString::fromStdWString(ocrContentField); @@ -340,20 +383,18 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr resultApi.setOcrContent(content); // 设置高亮内容 const QString highlightedContent = ContentHighlighter::customHighlight( - m_keywords, content, previewLen, enableHTML); + m_keywords, content, previewLen, enableHTML); resultApi.setHighlightedContent(highlightedContent); } } catch (const Lucene::LuceneException &e) { qWarning() << "Exception retrieving OCR content field:" << QString::fromStdWString(e.getError()); - // Continue without content highlight } catch (const std::exception &e) { qWarning() << "Standard exception retrieving OCR content field:" << e.what(); - // Continue without content highlight } } // 设置详细结果(如果启用) - if (Q_UNLIKELY(m_options.detailedResultsEnabled())) { + if (Q_UNLIKELY(detailedResults)) { // 文件名 Lucene::String filenameField = doc->get(LuceneFieldNames::OcrText::kFilename); if (!filenameField.empty()) { @@ -385,14 +426,30 @@ void OcrTextIndexedStrategy::processSearchResults(const Lucene::IndexSearcherPtr resultApi.setBirthTimestamp(timestamp); } } + + // 文件校验和 + Lucene::String checksumField = doc->get(LuceneFieldNames::OcrText::kCheckSum); + if (!checksumField.empty()) { + resultApi.setChecksum(QString::fromStdWString(checksumField)); + } + + // 文件大小 + Lucene::String fileSizeField = doc->get(LuceneFieldNames::OcrText::kFileSize); + if (!fileSizeField.empty()) { + bool ok = false; + qint64 fileSize = QString::fromStdWString(fileSizeField).toLongLong(&ok); + if (ok && fileSize > 0) { + resultApi.setFileSizeBytes(fileSize); + } + } } // Add to result collection - m_results.append(result); + 
m_results.append(std::move(result)); // Real-time result emission if (Q_UNLIKELY(m_options.resultFoundEnabled())) - emit resultFound(result); + emit resultFound(m_results.last()); } catch (const Lucene::LuceneException &e) { qWarning() << "Error processing result:" << QString::fromStdWString(e.getError()); @@ -435,11 +492,8 @@ void OcrTextIndexedStrategy::performOcrTextSearch(const SearchQuery &query) // Create searcher IndexSearcherPtr searcher = newLucene(reader); - // Create analyzer (reuse ChineseAnalyzer for OCR text) - AnalyzerPtr analyzer = newLucene(); - // Build query - m_currentQuery = buildLuceneQuery(query, analyzer, m_options.searchPath()); + m_currentQuery = buildLuceneQuery(query); if (!m_currentQuery) { qWarning() << "Failed to build Lucene query for OCR text search"; emit errorOccurred(SearchError(OcrTextSearchErrorCode::OcrTextIndexException)); diff --git a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h index b09aead1..d75ef469 100644 --- a/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h +++ b/src/dfm-search/dfm-search-lib/ocrtextsearch/ocrtextstrategies/indexedstrategy.h @@ -7,7 +7,6 @@ #include "basestrategy.h" #include -#include #include #include #include @@ -40,23 +39,16 @@ class OcrTextIndexedStrategy : public OcrTextBaseStrategy void performOcrTextSearch(const SearchQuery &query); // Build Lucene query - Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query, const Lucene::AnalyzerPtr &analyzer, const QString &searchPath); + Lucene::QueryPtr buildLuceneQuery(const SearchQuery &query); // Helper for simple queries - Lucene::QueryPtr buildSimpleOcrContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &ocrContentsParser); + Lucene::QueryPtr buildSimpleOcrContentsQuery(const SearchQuery &query); // Helper for "standard" boolean logic - Lucene::QueryPtr 
buildStandardBooleanOcrContentsQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &ocrContentsParser); + Lucene::QueryPtr buildStandardBooleanOcrContentsQuery(const SearchQuery &query); // Helper for "advanced" mixed AND logic (searches "ocr_contents" and "filename") - Lucene::QueryPtr buildAdvancedAndQuery( - const SearchQuery &query, - const Lucene::QueryParserPtr &ocrContentsParser, - const Lucene::AnalyzerPtr &analyzer); + Lucene::QueryPtr buildAdvancedAndQuery(const SearchQuery &query); // Process search results void processSearchResults(const Lucene::IndexSearcherPtr &searcher, diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp new file mode 100644 index 00000000..52ed77c9 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.cpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "actionextractor.h" + +#include "semantic/semanticruleengine.h" + +#include + +DFM_SEARCH_BEGIN_NS + +ActionExtractor::ActionExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +ActionExtractor::~ActionExtractor() = default; + +void ActionExtractor::extract(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("action")) { + return; + } + + QString ruleId; + QRegularExpressionMatch match; + if (!m_engine->match("action", input, match, &ruleId)) { + return; + } + + const QVariantMap metadata = m_engine->ruleMetadata("action", ruleId); + const QString timeFieldStr = metadata.value("time_field").toString(); + + if (timeFieldStr == "birth") { + intent.timeConstraint.timeField = TimeField::BirthTime; + } else if (timeFieldStr == "modify") { + intent.timeConstraint.timeField = TimeField::ModifyTime; + } else { + qWarning() << "Unknown time_field in action rule:" << timeFieldStr; + return; + } + + MatchSpan 
span; + span.start = match.capturedStart(); + span.end = match.capturedEnd(); + span.ruleId = ruleId; + intent.consumedSpans.append(span); +} + +QString ActionExtractor::name() const +{ + return QStringLiteral("action"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h new file mode 100644 index 00000000..b11ece8a --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/actionextractor.h @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef ACTIONEXTRACTOR_H +#define ACTIONEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class ActionExtractor : public DimensionExtractor +{ +public: + explicit ActionExtractor(SemanticRuleEngine *engine); + ~ActionExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + +private: + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // ACTIONEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp new file mode 100644 index 00000000..b52d876d --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.cpp @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "filetypeextractor.h"
+
+#include "semantic/semanticruleengine.h"
+
+DFM_SEARCH_BEGIN_NS
+
+FileTypeExtractor::FileTypeExtractor(SemanticRuleEngine *engine)
+    : m_engine(engine)
+{
+}
+
+FileTypeExtractor::~FileTypeExtractor() = default;
+
+void FileTypeExtractor::extract(const QString &input, ParsedIntent &intent)
+{
+    if (!m_engine->hasGroup("filetype")) {
+        return;
+    }
+
+    QStringList ruleIds;
+    const QList matches = m_engine->matchAll("filetype", input, &ruleIds);
+
+    QSet seenExtensions;
+
+    for (int i = 0; i < matches.size(); ++i) {
+        const QRegularExpressionMatch &m = matches[i];
+        const QVariantMap metadata = m_engine->ruleMetadata("filetype", ruleIds[i]);
+
+        const QStringList extensions = metadata.value("extensions").toStringList();
+        const bool isGeneral = metadata.value("general", false).toBool();
+
+        // A general/fallback rule is skipped once any extension has been collected,
+        // so it cannot widen a result that earlier matches already narrowed down.
+        if (isGeneral && !seenExtensions.isEmpty()) {
+            continue;
+        }
+
+        // QSet::insert() ignores values that are already present,
+        // so no separate contains() check is needed.
+        for (const QString &ext : extensions) {
+            seenExtensions.insert(ext);
+        }
+
+        MatchSpan span;
+        span.start = m.capturedStart();
+        span.end = m.capturedEnd();
+        span.ruleId = ruleIds[i];
+        intent.consumedSpans.append(span);
+    }
+
+    intent.fileExtensions = seenExtensions.values();
+}
+
+QString FileTypeExtractor::name() const
+{
+    return QStringLiteral("filetype");
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h
new file mode 100644
index 00000000..1f50faf1
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/filetypeextractor.h
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef FILETYPEEXTRACTOR_H +#define FILETYPEEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class FileTypeExtractor : public DimensionExtractor +{ +public: + explicit FileTypeExtractor(SemanticRuleEngine *engine); + ~FileTypeExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + +private: + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // FILETYPEEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp new file mode 100644 index 00000000..35af0891 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.cpp @@ -0,0 +1,208 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "keywordextractor.h" + +#include "semantic/semanticruleengine.h" + +#include + +DFM_SEARCH_BEGIN_NS + +KeywordExtractor::KeywordExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +KeywordExtractor::~KeywordExtractor() = default; + +void KeywordExtractor::extract(const QString &input, ParsedIntent &intent) +{ + // Strategy 1: structured keyword patterns (e.g., "contains X and Y") + if (extractStructuredKeywords(input, intent)) { + return; + } + + // Strategy 2: extract unconsumed text regions + extractUnconsumedText(input, intent); +} + +bool KeywordExtractor::extractStructuredKeywords(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("keyword")) { + return false; + } + + QString ruleId; + QRegularExpressionMatch match; + if (!m_engine->match("keyword", input, match, &ruleId)) { + return false; + } + + const QVariantMap metadata = m_engine->ruleMetadata("keyword", ruleId); + const int captureGroup = metadata.value("capture_group", 1).toInt(); + + if 
(captureGroup <= 0 || captureGroup > match.lastCapturedIndex()) { + return false; + } + + QString captured = match.captured(captureGroup).trimmed(); + if (captured.isEmpty()) { + return false; + } + + const bool multiKeyword = metadata.value("multi_keyword", false).toBool(); + + if (multiKeyword) { + intent.keywords = splitMultiKeywords(captured, metadata); + } else { + intent.keywords = { captured }; + } + + // Determine search target from rule metadata + const QString targetStr = metadata.value("search_target").toString(); + if (targetStr == "filename") { + intent.searchTarget = SearchTarget::FileNameOnly; + } else if (targetStr == "content") { + intent.searchTarget = SearchTarget::ContentOnly; + } + // "all" or empty → keep default SearchTarget::All + + // Mark the entire matched region as consumed + MatchSpan span; + span.start = match.capturedStart(); + span.end = match.capturedEnd(); + span.ruleId = ruleId; + intent.consumedSpans.append(span); + + return true; +} + +void KeywordExtractor::extractUnconsumedText(const QString &input, ParsedIntent &intent) +{ + QList allSpans = intent.consumedSpans; + + // Also consume noise words + if (m_engine->hasGroup("noise")) { + QStringList noiseRuleIds; + const QList noiseMatches = + m_engine->matchAll("noise", input, &noiseRuleIds); + + for (int i = 0; i < noiseMatches.size(); ++i) { + MatchSpan span; + span.start = noiseMatches[i].capturedStart(); + span.end = noiseMatches[i].capturedEnd(); + span.ruleId = noiseRuleIds[i]; + allSpans.append(span); + } + } + + // Extract text not covered by any consumed span + const QString unconsumed = extractUnconsumedRegions(input, allSpans); + + if (unconsumed.isEmpty()) { + return; + } + + // Clean up punctuation and particles using pattern from rule metadata + // Default: strip whitespace + const QString cleanupPattern = QStringLiteral("[\\s]+"); + QRegularExpression cleanupRe(cleanupPattern); + + // Try to get a more specific cleanup pattern from keyword rules + // Load from ALL 
rules in the group (not just matching ones), + // since cleanup_pattern is a configuration property, not a per-match property. + if (m_engine->hasGroup("keyword")) { + const QStringList allRuleIds = m_engine->ruleIds("keyword"); + for (const QString &rid : allRuleIds) { + const QVariantMap meta = m_engine->ruleMetadata("keyword", rid); + const QString pattern = meta.value("cleanup_pattern").toString(); + if (!pattern.isEmpty()) { + cleanupRe.setPattern(pattern); + break; + } + } + } + + const QString cleaned = unconsumed.trimmed() + .replace(cleanupRe, " ") + .simplified(); + + if (cleaned.isEmpty()) { + return; + } + + intent.keywords = { cleaned }; +} + +QString KeywordExtractor::extractUnconsumedRegions(const QString &input, const QList &allSpans) const +{ + if (input.isEmpty()) { + return {}; + } + + // Build a set of consumed character positions + QVector consumed(input.size(), false); + for (const MatchSpan &span : allSpans) { + if (span.isValid() && span.end <= input.size()) { + for (int i = span.start; i < span.end; ++i) { + consumed[i] = true; + } + } + } + + // Extract unconsumed regions + QString result; + int regionStart = -1; + + for (int i = 0; i < input.size(); ++i) { + if (!consumed[i]) { + if (regionStart < 0) { + regionStart = i; + } + } else { + if (regionStart >= 0) { + result += input.mid(regionStart, i - regionStart) + " "; + regionStart = -1; + } + } + } + + // Trailing region + if (regionStart >= 0) { + result += input.mid(regionStart); + } + + return result.trimmed(); +} + +QStringList KeywordExtractor::splitMultiKeywords(const QString &text, const QVariantMap &metadata) +{ + // Default split on comma + QString splitPattern = QStringLiteral("[,]+"); + + // Try to get language-specific split pattern from metadata + const QString metaSplit = metadata.value("split_pattern").toString(); + if (!metaSplit.isEmpty()) { + splitPattern = metaSplit; + } + + QRegularExpression splitRe(splitPattern); + const QStringList parts = text.split(splitRe, 
Qt::SkipEmptyParts); + QStringList result; + for (const QString &part : parts) { + const QString trimmed = part.trimmed(); + if (!trimmed.isEmpty()) { + result.append(trimmed); + } + } + return result; +} + +QString KeywordExtractor::name() const +{ + return QStringLiteral("keyword"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h new file mode 100644 index 00000000..07e0e935 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/keywordextractor.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef KEYWORDEXTRACTOR_H +#define KEYWORDEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class KeywordExtractor : public DimensionExtractor +{ +public: + explicit KeywordExtractor(SemanticRuleEngine *engine); + ~KeywordExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + +private: + bool extractStructuredKeywords(const QString &input, ParsedIntent &intent); + void extractUnconsumedText(const QString &input, ParsedIntent &intent); + QString extractUnconsumedRegions(const QString &input, const QList &allSpans) const; + static QStringList splitMultiKeywords(const QString &text, const QVariantMap &metadata); + + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // KEYWORDEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp new file mode 100644 index 00000000..ef129636 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.cpp @@ -0,0 +1,91 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "locationextractor.h" + +#include "semantic/semanticruleengine.h" + +#include +#include + +DFM_SEARCH_BEGIN_NS + +LocationExtractor::LocationExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +LocationExtractor::~LocationExtractor() = default; + +void LocationExtractor::extract(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("location")) { + return; + } + + // Use matchAll to support multiple directory mentions (e.g., "桌面和下载的图片") + QStringList ruleIds; + const QList matches = m_engine->matchAll("location", input, &ruleIds); + + for (int i = 0; i < matches.size(); ++i) { + const QRegularExpressionMatch &m = matches[i]; + const QVariantMap metadata = m_engine->ruleMetadata("location", ruleIds[i]); + + const QString xdgType = metadata.value("xdg_type").toString(); + const bool includeHidden = metadata.value("include_hidden", false).toBool(); + + const QString path = resolveXdgPath(xdgType); + if (path.isEmpty()) { + continue; + } + + if (!intent.searchDirectories.contains(path)) { + intent.searchDirectories.append(path); + } + + if (includeHidden) { + intent.includeHidden = true; + } + + MatchSpan span; + span.start = m.capturedStart(); + span.end = m.capturedEnd(); + span.ruleId = ruleIds[i]; + intent.consumedSpans.append(span); + } +} + +QString LocationExtractor::resolveXdgPath(const QString &xdgType) +{ + if (xdgType == QLatin1String("desktop")) { + return QStandardPaths::writableLocation(QStandardPaths::DesktopLocation); + } + if (xdgType == QLatin1String("download")) { + return QStandardPaths::writableLocation(QStandardPaths::DownloadLocation); + } + if (xdgType == QLatin1String("documents")) { + return QStandardPaths::writableLocation(QStandardPaths::DocumentsLocation); + } + if (xdgType == QLatin1String("pictures")) { + return QStandardPaths::writableLocation(QStandardPaths::PicturesLocation); + } + if (xdgType == QLatin1String("music")) { + return 
QStandardPaths::writableLocation(QStandardPaths::MusicLocation); + } + if (xdgType == QLatin1String("movies")) { + return QStandardPaths::writableLocation(QStandardPaths::MoviesLocation); + } + if (xdgType == QLatin1String("trash")) { + return QDir::homePath() + QLatin1String("/.local/share/Trash/files"); + } + + return {}; +} + +QString LocationExtractor::name() const +{ + return QStringLiteral("location"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h new file mode 100644 index 00000000..cd1104c3 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/locationextractor.h @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef LOCATIONEXTRACTOR_H +#define LOCATIONEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class LocationExtractor : public DimensionExtractor +{ +public: + explicit LocationExtractor(SemanticRuleEngine *engine); + ~LocationExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + + /** + * @brief Resolve an XDG type string to an absolute filesystem path. 
+ * @param xdgType One of: "desktop", "download", "documents", "pictures", + * "music", "movies", "trash" + * @return The resolved absolute path, or empty string if unknown + */ + static QString resolveXdgPath(const QString &xdgType); + +private: + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // LOCATIONEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp new file mode 100644 index 00000000..87a1dc72 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.cpp @@ -0,0 +1,139 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "sizeextractor.h" + +#include "semantic/semanticruleengine.h" + +#include + +DFM_SEARCH_BEGIN_NS + +SizeExtractor::SizeExtractor(SemanticRuleEngine *engine) + : m_engine(engine) +{ +} + +SizeExtractor::~SizeExtractor() = default; + +void SizeExtractor::extract(const QString &input, ParsedIntent &intent) +{ + if (!m_engine->hasGroup("size")) { + return; + } + + QString ruleId; + QRegularExpressionMatch match; + if (!m_engine->match("size", input, match, &ruleId)) { + return; + } + + const QVariantMap metadata = m_engine->ruleMetadata("size", ruleId); + const QString typeStr = metadata.value("type").toString(); + SizeConstraint sc; + + if (typeStr == "preset") { + sc.minSize = metadata.value("min_bytes", 0).toLongLong(); + sc.maxSize = metadata.value("max_bytes", 0).toLongLong(); + if (metadata.contains("include_upper")) { + sc.includeUpper = metadata.value("include_upper").toBool(); + } + if (metadata.contains("include_lower")) { + sc.includeLower = metadata.value("include_lower").toBool(); + } + } else if (typeStr == "dynamic") { + const QString direction = metadata.value("direction").toString(); + + if (direction == "min") { + const QString value = match.captured("value"); + const QString unit = 
normalizeUnit(match.captured("unit"), metadata); + qint64 bytes = parseSizeToBytes(value, unit); + if (bytes <= 0) { + return; + } + sc.minSize = bytes; + sc.includeLower = true; + } else if (direction == "max") { + const QString value = match.captured("value"); + const QString unit = normalizeUnit(match.captured("unit"), metadata); + qint64 bytes = parseSizeToBytes(value, unit); + if (bytes <= 0) { + return; + } + sc.maxSize = bytes; + sc.includeUpper = true; + } else if (direction == "range") { + const QString minVal = match.captured("min_val"); + const QString minUnit = normalizeUnit(match.captured("min_unit"), metadata); + const QString maxVal = match.captured("max_val"); + const QString maxUnit = normalizeUnit(match.captured("max_unit"), metadata); + qint64 minBytes = parseSizeToBytes(minVal, minUnit); + qint64 maxBytes = parseSizeToBytes(maxVal, maxUnit); + if (minBytes <= 0 || maxBytes <= 0) { + return; + } + sc.minSize = minBytes; + sc.maxSize = maxBytes; + } + } + + if (sc.isValid()) { + intent.sizeConstraint = sc; + MatchSpan span; + span.start = match.capturedStart(); + span.end = match.capturedEnd(); + span.ruleId = ruleId; + intent.consumedSpans.append(span); + } +} + +QString SizeExtractor::normalizeUnit(const QString &rawUnit, const QVariantMap &metadata) +{ + if (rawUnit.isEmpty()) { + return {}; + } + + const QVariantMap unitMap = metadata.value("unit_map").toMap(); + if (!unitMap.isEmpty()) { + const QString mapped = unitMap.value(rawUnit).toString(); + if (!mapped.isEmpty()) { + return mapped; + } + } + + return rawUnit; +} + +qint64 SizeExtractor::parseSizeToBytes(const QString &value, const QString &unit) +{ + bool ok = false; + double num = value.toDouble(&ok); + if (!ok || num <= 0) { + return -1; + } + + const QString u = unit.toUpper(); + if (u.isEmpty() || u == "B" || u == "BB") { + return static_cast(num); + } + if (u == "K" || u == "KB") { + return static_cast(num * 1024); + } + if (u == "M" || u == "MB") { + return static_cast(num * 
1024 * 1024); + } + if (u == "G" || u == "GB") { + return static_cast(num * 1024 * 1024 * 1024); + } + + qWarning() << "Unknown size unit:" << unit; + return -1; +} + +QString SizeExtractor::name() const +{ + return QStringLiteral("size"); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h new file mode 100644 index 00000000..2946d3a7 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/sizeextractor.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SIZEEXTRACTOR_H +#define SIZEEXTRACTOR_H + +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +class SizeExtractor : public DimensionExtractor +{ +public: + explicit SizeExtractor(SemanticRuleEngine *engine); + ~SizeExtractor() override; + + void extract(const QString &input, ParsedIntent &intent) override; + QString name() const override; + +private: + static qint64 parseSizeToBytes(const QString &value, const QString &unit); + static QString normalizeUnit(const QString &rawUnit, const QVariantMap &metadata); + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // SIZEEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp new file mode 100644 index 00000000..7a795cad --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.cpp @@ -0,0 +1,312 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "timeextractor.h"
+
+#include "semantic/semanticruleengine.h"
+
+#include
+#include
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+TimeExtractor::TimeExtractor(SemanticRuleEngine *engine)
+    : m_engine(engine)
+{
+}
+
+TimeExtractor::~TimeExtractor() = default;
+
+void TimeExtractor::extract(const QString &input, ParsedIntent &intent)
+{
+    if (!m_engine->hasGroup("time")) {
+        return;
+    }
+
+    QString ruleId;
+    QRegularExpressionMatch match;
+    if (!m_engine->match("time", input, match, &ruleId)) {
+        return;
+    }
+
+    const QVariantMap metadata = m_engine->ruleMetadata("time", ruleId);
+    const QString typeStr = metadata.value("type").toString();
+    TimeConstraint tc;
+
+    if (typeStr == "preset") {
+        const QString presetStr = metadata.value("preset").toString();
+        static const QMap kPresetMap = {
+            { "today", TimePreset::Today },
+            { "yesterday", TimePreset::Yesterday },
+            { "day_before_yesterday", TimePreset::DayBeforeYesterday },
+            { "this_week", TimePreset::ThisWeek },
+            { "last_week", TimePreset::LastWeek },
+            { "this_month", TimePreset::ThisMonth },
+            { "last_month", TimePreset::LastMonth },
+            { "this_year", TimePreset::ThisYear },
+            { "last_year", TimePreset::LastYear },
+        };
+
+        if (kPresetMap.contains(presetStr)) {
+            tc.kind = TimeConstraintKind::Preset;
+            tc.preset = kPresetMap.value(presetStr);
+        }
+    } else if (typeStr == "custom") {
+        parseCustomTime(match, metadata, tc);
+    } else if (typeStr == "relative") {
+        parseRelativeTime(metadata, tc);
+    } else if (typeStr == "relative_dynamic") {
+        parseDynamicRelativeTime(match, metadata, tc);
+    }
+
+    if (tc.isValid()) {
+        intent.timeConstraint = tc;
+        MatchSpan span;
+        span.start = match.capturedStart();
+        span.end = match.capturedEnd();
+        span.ruleId = ruleId;
+        intent.consumedSpans.append(span);
+    }
+}
+
+// Builds a Custom constraint from the named captures "year"/"month"/"day":
+// full date -> that day, month without day -> that month, no month -> whole year.
+void TimeExtractor::parseCustomTime(const QRegularExpressionMatch &match,
+                                    const QVariantMap &metadata,
+                                    TimeConstraint &tc)
+{
+    auto tryCapture = [&match](const QString &name) -> QString {
+        const QString val = match.captured(name);
+        return val.isNull() ? QString() : val;
+    };
+
+    // Load locale-aware number conversion from rule metadata
+    const QMap digitMap = mapFromVariant(metadata.value("digit_map"));
+    const QString tensUnit = metadata.value("tens_unit").toString();
+
+    const QDate today = QDate::currentDate();
+    int year = today.year();
+    int month = 0;
+    int day = 0;
+
+    {
+        const QString yearStr = tryCapture("year");
+        if (!yearStr.isEmpty()) {
+            year = localeAwareToInt(yearStr, digitMap, tensUnit);
+            if (year <= 0) {
+                qWarning() << "Invalid year:" << yearStr;
+                return;
+            }
+            if (year < 100) {
+                year += 2000;
+            }
+        }
+    }
+
+    {
+        const QString monthStr = tryCapture("month");
+        if (!monthStr.isEmpty()) {
+            month = localeAwareToInt(monthStr, digitMap, tensUnit);
+            if (month < 1 || month > 12) {
+                qWarning() << "Invalid month:" << monthStr;
+                return;
+            }
+        }
+    }
+
+    {
+        const QString dayStr = tryCapture("day");
+        if (!dayStr.isEmpty()) {
+            day = localeAwareToInt(dayStr, digitMap, tensUnit);
+            if (day < 1 || day > 31) {
+                qWarning() << "Invalid day:" << dayStr;
+                return;
+            }
+        }
+    }
+
+    // Validate date
+    if (month > 0 && day > 0) {
+        QDate date(year, month, day);
+        if (!date.isValid()) {
+            qWarning() << "Invalid date:" << year << month << day;
+            return;
+        }
+        tc.kind = TimeConstraintKind::Custom;
+        tc.customStart = QDateTime(date, QTime(0, 0, 0));
+        tc.customEnd = QDateTime(date, QTime(23, 59, 59));
+    } else if (month > 0 && day == 0) {
+        // Year-month only: entire month
+        QDate monthStart(year, month, 1);
+        QDate monthEnd(year, month, monthStart.daysInMonth());
+        tc.kind = TimeConstraintKind::Custom;
+        tc.customStart = QDateTime(monthStart, QTime(0, 0, 0));
+        tc.customEnd = QDateTime(monthEnd, QTime(23, 59, 59));
+    } else if (month == 0) {
+        // Year only: entire year
+        tc.kind = TimeConstraintKind::Custom;
+        tc.customStart = QDateTime(QDate(year, 1, 1), QTime(0, 0, 0));
+        tc.customEnd = QDateTime(QDate(year, 12, 31), QTime(23, 59, 59));
+    }
+}
+
+void TimeExtractor::parseRelativeTime(const QVariantMap &metadata, TimeConstraint &tc)
+{
+    const QDateTime now = QDateTime::currentDateTime();
+    const int agoEndSecs = metadata.value("ago_end_seconds", 0).toInt();
+    const int agoStartSecs = metadata.value("ago_start_seconds", 0).toInt();
+
+    tc.kind = TimeConstraintKind::Relative;
+    tc.customEnd = now.addSecs(-agoEndSecs);
+
+    if (agoStartSecs < 0) {
+        // Sentinel: "from epoch"
+        tc.customStart = QDateTime::fromMSecsSinceEpoch(0);
+    } else {
+        tc.customStart = now.addSecs(-agoStartSecs);
+    }
+}
+
+void TimeExtractor::parseDynamicRelativeTime(const QRegularExpressionMatch &match,
+                                             const QVariantMap &metadata,
+                                             TimeConstraint &tc)
+{
+    // Load locale-aware number conversion from rule metadata
+    const QMap digitMap = mapFromVariant(metadata.value("digit_map"));
+    const QString tensUnit = metadata.value("tens_unit").toString();
+
+    // Extract numeric value from any of the named capture groups (value, value2, value3, value4)
+    int value = 0;
+    const QStringList captureNames = { QStringLiteral("value"), QStringLiteral("value2"),
+                                       QStringLiteral("value3"), QStringLiteral("value4") };
+    for (const QString &name : captureNames) {
+        const QString captured = match.captured(name);
+        if (!captured.isNull()) {
+            value = localeAwareToInt(captured, digitMap, tensUnit);
+            if (value > 0) {
+                break;
+            }
+            value = 0;
+        }
+    }
+
+    if (value <= 0 || value > 3650) {
+        return;
+    }
+
+    // Unit comes only from metadata "default_unit" (hours/weeks/months, else days); minutes/years are never selected here
+    const QString unitStr = metadata.value("default_unit").toString();
+    TimeUnit unit = TimeUnit::Days;
+    if (unitStr == "hours") {
+        unit = TimeUnit::Hours;
+    } else if (unitStr == "weeks") {
+        unit = TimeUnit::Weeks;
+    } else if (unitStr == "months") {
+        unit = TimeUnit::Months;
+    }
+
+    const QDateTime now = QDateTime::currentDateTime();
+    qint64 totalSeconds = 0;
+
+    switch (unit) {
+    case TimeUnit::Minutes:
+        totalSeconds = static_cast(value) * 60;
+        break;
+    case TimeUnit::Hours:
+        totalSeconds = static_cast(value) * 3600;
+        break;
+    case TimeUnit::Days:
+        totalSeconds = static_cast(value) * 86400;
+        break;
+    case TimeUnit::Weeks:
+        totalSeconds = static_cast(value) * 7 * 86400;
+        break;
+    case TimeUnit::Months: {
+        // Calendar-based: go back whole months and start at midnight of that date
+        const QDate startDate = now.addMonths(-value).date();
+        tc.kind = TimeConstraintKind::Relative;
+        tc.customStart = QDateTime(startDate, QTime(0, 0, 0));
+        tc.customEnd = now;
+        tc.relativeValue = value;
+        tc.relativeUnit = unit;
+        return;
+    }
+    case TimeUnit::Years:
+        totalSeconds = static_cast(value) * 365 * 86400;
+        break;
+    }
+
+    tc.kind = TimeConstraintKind::Relative;
+    tc.customStart = now.addSecs(-totalSeconds);
+    tc.customEnd = now;
+    tc.relativeValue = value;
+    tc.relativeUnit = unit;
+}
+
+int TimeExtractor::localeAwareToInt(const QString &input,
+                                    const QMap &digitMap,
+                                    const QString &tensUnit)
+{
+    if (input.isEmpty()) {
+        return -1;
+    }
+
+    // Try direct integer conversion first (handles Arabic numerals)
+    bool ok = false;
+    int directValue = input.toInt(&ok);
+    if (ok) {
+        return directValue;
+    }
+
+    // Single digit character from digit_map
+    if (input.size() == 1 && digitMap.contains(input)) {
+        return digitMap.value(input);
+    }
+
+    // No digit_map or tens_unit configured — cannot parse locale-specific numbers
+    if (digitMap.isEmpty() || tensUnit.isEmpty()) {
+        return -1;
+    }
+
+    // Tens-unit numerals have the shape [tens digit]<tensUnit>[ones digit]:
+    // "十" = 10, "十五" = 15, "二十" = 20, "二十五" = 25.
+    // An omitted tens digit means one ten; an omitted ones digit means zero.
+    const auto tensPos = input.indexOf(tensUnit);
+    if (tensPos < 0 || tensPos > 1) {
+        return -1;
+    }
+
+    const QString tensDigit = input.left(tensPos);
+    const QString onesDigit = input.mid(tensPos + tensUnit.size());
+    if (onesDigit.size() > 1) {
+        return -1;
+    }
+
+    // Each digit that is present must be a digit_map entry; -1 marks a miss.
+    const int tens = tensDigit.isEmpty() ? 1 : digitMap.value(tensDigit, -1);
+    const int ones = onesDigit.isEmpty() ? 0 : digitMap.value(onesDigit, -1);
+    if (tens < 1 || tens > 9 || ones < 0 || ones > 9) {
+        return -1;
+    }
+
+    return tens * 10 + ones;
+}
+
+QMap TimeExtractor::mapFromVariant(const QVariant &variant)
+{
+    QMap result;
+    const QVariantMap map = variant.toMap();
+    for (auto it = map.constBegin(); it != map.constEnd(); ++it) {
+        result.insert(it.key(), it.value().toInt());
+    }
+    return result;
+}
+
+QString TimeExtractor::name() const
+{
+    return QStringLiteral("time");
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h
new file mode 100644
index 00000000..c99095bf
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/extractors/timeextractor.h
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef TIMEEXTRACTOR_H
+#define TIMEEXTRACTOR_H
+
+#include
+
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+class SemanticRuleEngine;
+
+class TimeExtractor : public DimensionExtractor
+{
+public:
+    explicit TimeExtractor(SemanticRuleEngine *engine);
+    ~TimeExtractor() override;
+
+    void extract(const QString &input, ParsedIntent &intent) override;
+    QString name() const override;
+
+private:
+    void parseCustomTime(const QRegularExpressionMatch &match, const QVariantMap &metadata, TimeConstraint &tc);
+    void parseRelativeTime(const QVariantMap &metadata, TimeConstraint &tc);
+    void parseDynamicRelativeTime(const QRegularExpressionMatch &match, const QVariantMap &metadata, TimeConstraint &tc);
+
+    /**
+     * @brief Convert a string to int using locale-aware digit mapping.
+     *
+     * First tries direct integer conversion (Arabic numerals).
+     * Falls back to digit_map lookup and positional tens-unit parsing.
+     * @param input The string to convert
+     * @param digitMap Mapping of locale-specific digit characters to integers (from rule metadata)
+     * @param tensUnit The character representing the tens place (from rule metadata, e.g.
"十") + * @return The integer value, or -1 if conversion fails + */ + static int localeAwareToInt(const QString &input, + const QMap &digitMap, + const QString &tensUnit); + + /** + * @brief Convert a QVariantMap (from JSON) to a QMap. + */ + static QMap mapFromVariant(const QVariant &variant); + + SemanticRuleEngine *m_engine; +}; + +DFM_SEARCH_END_NS + +#endif // TIMEEXTRACTOR_H diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp new file mode 100644 index 00000000..fdcd9bc3 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "intentparser.h" + +#include "semanticruleengine.h" +#include "extractors/actionextractor.h" +#include "extractors/filetypeextractor.h" +#include "extractors/keywordextractor.h" +#include "extractors/locationextractor.h" +#include "extractors/sizeextractor.h" +#include "extractors/timeextractor.h" + +DFM_SEARCH_BEGIN_NS + +IntentParser::IntentParser(SemanticRuleEngine *engine) + : m_engine(engine) +{ + initDefaultExtractors(); +} + +IntentParser::~IntentParser() = default; + +void IntentParser::parse(const QString &input, ParsedIntent &intent) +{ + for (DimensionExtractor *extractor : m_extractors) { + extractor->extract(input, intent); + } +} + +void IntentParser::addExtractor(std::unique_ptr extractor) +{ + m_extractors.push_back(extractor.get()); + m_extractorOwners.push_back(std::move(extractor)); +} + +QStringList IntentParser::extractorNames() const +{ + QStringList names; + for (DimensionExtractor *e : m_extractors) { + names.append(e->name()); + } + return names; +} + +void IntentParser::initDefaultExtractors() +{ + // Order matters: keyword MUST be last (depends on consumedSpans) + addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); + 
addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); + addExtractor(std::make_unique(m_engine)); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/intentparser.h b/src/dfm-search/dfm-search-lib/semantic/intentparser.h new file mode 100644 index 00000000..39cab098 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/intentparser.h @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef INTENTPARSER_H +#define INTENTPARSER_H + +#include +#include +#include + +#include +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; + +/** + * @brief Orchestrates dimension extractors to parse natural language into intent. + * + * Extractors run in order. KeywordExtractor MUST be last + * because it relies on consumedSpans from earlier extractors. + */ +class IntentParser +{ +public: + explicit IntentParser(SemanticRuleEngine *engine); + ~IntentParser(); + + /** + * @brief Parse natural language input into a structured intent. + * @param input The raw natural language string + * @param intent Output: parsed intent + */ + void parse(const QString &input, ParsedIntent &intent); + + /** + * @brief Add a custom dimension extractor. + * Extractors are called in the order they are added. + */ + void addExtractor(std::unique_ptr extractor); + + /** + * @brief Get the list of extractor names. 
+ */ + QStringList extractorNames() const; + +private: + void initDefaultExtractors(); + + SemanticRuleEngine *m_engine; + std::vector m_extractors; + std::vector> m_extractorOwners; +}; + +DFM_SEARCH_END_NS + +#endif // INTENTPARSER_H diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp new file mode 100644 index 00000000..e7372d42 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.cpp @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "ruleconfigloader.h" +#include "semanticruleengine.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DFM_SEARCH_BEGIN_NS + +namespace { +#ifdef CMAKE_INSTALL_PREFIX +constexpr auto kInstallPrefix = CMAKE_INSTALL_PREFIX; +#else +constexpr auto kInstallPrefix = "/usr"; +#endif + +constexpr auto kLibName = "dfm-search"; +} // namespace + +QString RuleConfigLoader::libName() +{ + return QLatin1String(kLibName); +} + +QString RuleConfigLoader::systemRulesDir() +{ + return QDir(QDir(QLatin1String(kInstallPrefix)) + .absoluteFilePath(QLatin1String("share/deepin/") + + libName() + + "/semantic/rules")) + .absolutePath(); +} + +QString RuleConfigLoader::userRulesDir() +{ + return QDir(QStandardPaths::writableLocation(QStandardPaths::GenericConfigLocation) + + "/deepin/" + + libName() + + "/semantic/rules") + .absolutePath(); +} + +QString RuleConfigLoader::currentLocaleName() +{ + return QLocale::system().name().simplified(); +} + +QStringList RuleConfigLoader::ruleFilePaths() +{ + QStringList paths; + QSet seen; // deduplicate by filename + + const QStringList dirs { resolveLocaleDir(userRulesDir()), + resolveLocaleDir(systemRulesDir()) }; + + for (const QString &dir : dirs) { + const QStringList files = QDir(dir).entryList( + QStringList { QStringLiteral("*.json") }, + QDir::Files | 
QDir::Readable);
+        for (const QString &filename : files) {
+            const QString absPath = QDir(dir).absoluteFilePath(filename);
+            if (!seen.contains(filename) && validateRuleFile(absPath)) {
+                paths.append(absPath);
+                seen.insert(filename);
+            }
+        }
+    }
+
+    return paths;
+}
+
+QString RuleConfigLoader::resolveLocaleDir(const QString &baseDir)
+{
+    const QString locale = currentLocaleName();
+
+    // Try full locale name (e.g., zh_CN)
+    const QString fullLocalePath = QDir(baseDir).absoluteFilePath(locale);
+    if (QDir(fullLocalePath).exists()) {
+        return fullLocalePath;
+    }
+
+    // Try language only (e.g., zh from zh_CN)
+    const QString langOnly = locale.split(QLatin1Char('_')).value(0);
+    if (!langOnly.isEmpty() && langOnly != locale) {
+        const QString langOnlyPath = QDir(baseDir).absoluteFilePath(langOnly);
+        if (QDir(langOnlyPath).exists()) {
+            return langOnlyPath;
+        }
+    }
+
+    // Fallback to default locale
+    return QDir(baseDir).absoluteFilePath(QLatin1String(kDefaultLocale));
+}
+
+QString RuleConfigLoader::resolveRulePath(const QString &filename)
+{
+    // User-local override with locale
+    const QString userLocaleDir = resolveLocaleDir(userRulesDir());
+    const QString userPath = QDir(userLocaleDir).absoluteFilePath(filename);
+    if (QFile::exists(userPath) && validateRuleFile(userPath)) {
+        return userPath;
+    }
+
+    // System rules with locale
+    const QString sysLocaleDir = resolveLocaleDir(systemRulesDir());
+    const QString sysPath = QDir(sysLocaleDir).absoluteFilePath(filename);
+    if (QFile::exists(sysPath) && validateRuleFile(sysPath)) {
+        return sysPath;
+    }
+
+    return {};
+}
+
+/**
+ * Parse the JSON rule file at @p path and append every valid entry of its
+ * top-level "groups" array to @p groups. Invalid groups are logged and skipped.
+ *
+ * @param path   Path of the JSON rule file
+ * @param groups Receives the parsed groups; existing entries are kept
+ * @return true if this file contributed at least one group; false if it could
+ *         not be opened, is not valid JSON, or contained no valid group
+ */
+bool RuleConfigLoader::loadRuleFile(const QString &path, QList<RuleGroup> &groups)
+{
+    QFile file(path);
+    if (!file.open(QIODevice::ReadOnly)) {
+        qWarning() << "Failed to open rule file:" << path;
+        return false;
+    }
+
+    QJsonParseError parseError;
+    const QJsonDocument doc = QJsonDocument::fromJson(file.readAll(), &parseError);
+    if (parseError.error != QJsonParseError::NoError) {
+        qWarning() << "JSON parse error in" << path << ":" << parseError.errorString();
+        return false;
+    }
+
+    const QJsonObject root = doc.object();
+    const QJsonArray groupsArray = root.value("groups").toArray();
+
+    // The caller's list may already hold groups from other files, so success is
+    // judged by what this file adds, not by the list being non-empty afterwards.
+    const auto countBefore = groups.size();
+
+    for (const QJsonValue &gv : groupsArray) {
+        RuleGroup group;
+        if (!SemanticRuleEngine::parseRuleGroupStatic(gv.toObject(), group)) {
+            qWarning() << "Invalid rule group in" << path;
+            continue;
+        }
+        groups.append(group);
+    }
+
+    return groups.size() > countBefore;
+}
+
+bool RuleConfigLoader::validateRuleFile(const QString &path)
+{
+    QFile file(path);
+    if (!file.open(QIODevice::ReadOnly)) {
+        return false;
+    }
+
+    QJsonParseError parseError;
+    const QJsonDocument doc = QJsonDocument::fromJson(file.readAll(), &parseError);
+    if (parseError.error != QJsonParseError::NoError) {
+        return false;
+    }
+
+    const QJsonObject root = doc.object();
+    return root.contains("groups") && root.value("groups").isArray();
+}
+
+bool RuleConfigLoader::ensureUserRulesDir()
+{
+    const QString dir = userRulesDir();
+    if (!QDir().mkpath(dir)) {
+        qWarning() << "Failed to create user rules directory:" << dir;
+        return false;
+    }
+    return true;
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h
new file mode 100644
index 00000000..d03bf3b9
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/ruleconfigloader.h
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef RULECONFIGLOADER_H
+#define RULECONFIGLOADER_H
+
+#include
+
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+struct RuleGroup;
+
+/**
+ * @brief Loads semantic rule configuration files from system/user paths.
+ *
+ * Locale resolution: rules are organized under locale subdirectories
+ * (e.g., rules/zh_CN/, rules/en_US/).
The loader resolves the locale
+ * at runtime using QLocale and falls back through a chain:
+ * zh_CN -> zh -> zh_CN (default)
+ *
+ * Priority: user-local config > system-installed config.
+ * System path: /usr/share/deepin/dfm-search/semantic/rules/
+ *              (with the default "/usr" install prefix)
+ * User path: ~/.config/deepin/dfm-search/semantic/rules/
+ *            (under QStandardPaths::GenericConfigLocation)
+ */
+class RuleConfigLoader
+{
+public:
+    /**
+     * @brief Get the library name used as a path component of the rules directories.
+     */
+    static QString libName();
+
+    /**
+     * @brief Get the system-installed rules directory.
+     */
+    static QString systemRulesDir();
+
+    /**
+     * @brief Get the user-local rules directory for overrides.
+     */
+    static QString userRulesDir();
+
+    /**
+     * @brief Get the current locale name (e.g., "zh_CN").
+     * Uses QLocale::system().name().simplified().
+     */
+    static QString currentLocaleName();
+
+    /**
+     * @brief Scan locale directories and return resolved paths for all rule files.
+     * Scans user dir first, then system dir; user files take priority.
+     * Only "*.json" files that pass validateRuleFile() are returned.
+     * Falls back through locale chain: zh_CN -> zh -> zh_CN (default).
+     * @return List of resolved absolute paths, deduplicated by file name
+     */
+    static QStringList ruleFilePaths();
+
+    /**
+     * @brief Resolve the effective path for a single rule file.
+     * Checks user dir first, then system dir, with locale subdirectory lookup.
+     * Falls back to zh_CN if the current locale directory is not found.
+     * @param filename The rule file name (e.g., "time_rules.json")
+     * @return The resolved absolute path, or empty if not found
+     */
+    static QString resolveRulePath(const QString &filename);
+
+    /**
+     * @brief Load and parse a rule file into groups.
+     * @param path Absolute path to the JSON file
+     * @param groups Output: successfully parsed rule groups are appended to this list
+     * @return true if file was loaded and valid
+     */
+    static bool loadRuleFile(const QString &path, QList &groups);
+
+    /**
+     * @brief Validate JSON structure of a rule file.
+ * @param path Absolute path to the JSON file + * @return true if valid + */ + static bool validateRuleFile(const QString &path); + + /** + * @brief Ensure user rules directory exists. + * @return true on success + */ + static bool ensureUserRulesDir(); + +private: + /** + * @brief Get the locale subdirectory name with fallback chain. + * Tries: full locale (zh_CN) -> language only (zh) -> default (zh_CN) + * @param baseDir The base rules directory + * @return The locale subdirectory path that exists, or baseDir/defaultLocale + */ + static QString resolveLocaleDir(const QString &baseDir); + + static constexpr const char *kDefaultLocale = "zh_CN"; +}; + +DFM_SEARCH_END_NS + +#endif // RULECONFIGLOADER_H diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json new file mode 100644 index 00000000..3ed04442 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/action_rules.json @@ -0,0 +1,32 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "action", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "action_create", + "pattern": "新建的|创建的|存下来的|保存的|新加的", + "description": "Action: created files (BirthTime)", + "enabled": true, + "priority": 200, + "metadata": { + "time_field": "birth" + } + }, + { + "id": "action_modify", + "pattern": "修改过的|编辑过的|改过的|写过的|更新的", + "description": "Action: modified files (ModifyTime)", + "enabled": true, + "priority": 200, + "metadata": { + "time_field": "modify" + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json new file mode 100644 index 00000000..ac471fbf --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/filetype_rules.json @@ -0,0 +1,154 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "filetype", + "version": "1.0.0", + "locale": "zh-CN", + "rules": 
[ + { + "id": "filetype_word", + "pattern": "word|doc|docx", + "description": "Word documents", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["doc", "docx"], + "fileTypes": ["doc"] + } + }, + { + "id": "filetype_pdf", + "pattern": "pdf", + "description": "PDF documents", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["pdf"] + } + }, + { + "id": "filetype_excel", + "pattern": "excel|xls|xlsx", + "description": "Excel spreadsheets", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["xls", "xlsx"], + "fileTypes": ["doc"] + } + }, + { + "id": "filetype_ppt", + "pattern": "ppt|pptx", + "description": "PowerPoint presentations", + "enabled": true, + "priority": 200, + "metadata": { + "extensions": ["ppt", "pptx"] + } + }, + { + "id": "filetype_document_general", + "pattern": "文档|报告|文章|方案|文本|资料|笔记|稿件", + "description": "Generic documents (fallback)", + "enabled": true, + "priority": 100, + "metadata": { + "extensions": ["doc", "docx", "pdf", "txt", "wps", "rtf", "md", "odt"], + "fileTypes": ["doc"], + "general": true + } + }, + { + "id": "filetype_spreadsheet_general", + "pattern": "表格|统计表|报表|名单|数据表|明细", + "description": "Generic spreadsheets (fallback)", + "enabled": true, + "priority": 100, + "metadata": { + "extensions": ["xls", "xlsx", "csv", "ods", "et"], + "fileTypes": ["doc"], + "general": true + } + }, + { + "id": "filetype_presentation_general", + "pattern": "幻灯片|演示文稿|汇报|课件|宣讲", + "description": "Generic presentations (fallback)", + "enabled": true, + "priority": 100, + "metadata": { + "extensions": ["ppt", "pptx", "dps", "odp"], + "general": true + } + }, + { + "id": "filetype_image", + "pattern": "图片|照片|截图|壁纸|海报|相片|表情包|图", + "description": "Images", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["jpg", "jpeg", "png", "gif", "bmp", "webp", "svg"], + "fileTypes": ["pic"] + } + }, + { + "id": "filetype_video", + "pattern": "视频|录像|电影|动画|短片|片子", + "description": "Videos", + 
"enabled": true, + "priority": 150, + "metadata": { + "extensions": ["mp4", "avi", "mkv", "mov", "flv", "wmv", "webm"], + "fileTypes": ["video"] + } + }, + { + "id": "filetype_audio", + "pattern": "音频|音乐|录音|歌|语音", + "description": "Audio files", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["mp3", "wav", "flac", "aac", "ogg", "m4a"], + "fileTypes": ["audio"] + } + }, + { + "id": "filetype_archive", + "pattern": "压缩包|归档|源码包|打包文件|zip|rar", + "description": "Archive files", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["zip", "tar.gz", "tar", "rar", "7z", "bz2"], + "fileTypes": ["archive"] + } + }, + { + "id": "filetype_application", + "pattern": "安装包|软件|应用|脚本|程序", + "description": "Application packages", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["deb", "AppImage", "sh", "py", "bin", "run"], + "fileTypes": ["app"] + } + }, + { + "id": "filetype_design_source", + "pattern": "源文件|设计稿|矢量图|工程文件|psd|fig|sketch", + "description": "Design source files", + "enabled": true, + "priority": 150, + "metadata": { + "extensions": ["psd", "ai", "fig", "sketch"] + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json new file mode 100644 index 00000000..6321765e --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/keyword_rules.json @@ -0,0 +1,68 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "keyword", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "keyword_contains", + "pattern": "包含(.+?)(?:的|$)", + "description": "Contains keyword pattern", + "enabled": true, + "priority": 200, + "metadata": { + "capture_group": 1, + "multi_keyword": true, + "search_target": "all", + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + }, + { + "id": 
"keyword_named", + "pattern": "名为(.+?)(?:的|$)|叫做(.+?)(?:的|$)", + "description": "Named keyword pattern", + "enabled": true, + "priority": 200, + "metadata": { + "capture_group": 1, + "multi_keyword": false, + "search_target": "filename", + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + }, + { + "id": "keyword_content_has", + "pattern": "内容(?:包含|含有|带有)(.+?)(?:的|$)", + "description": "Content contains keyword pattern", + "enabled": true, + "priority": 210, + "metadata": { + "capture_group": 1, + "multi_keyword": true, + "search_target": "content", + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + }, + { + "id": "keyword_filename_has", + "pattern": "文件名(?:包含|含有|带有|是|为)(.+?)(?:的|$)", + "description": "Filename contains keyword pattern", + "enabled": true, + "priority": 210, + "metadata": { + "capture_group": 1, + "multi_keyword": true, + "search_target": "filename", + "cleanup_pattern": "[\\s。、\u201c\u201d\uff0c\u3001\u7684\u4e86\u8981\u554a]+", + "split_pattern": "[\u548c\u4e0e\u4ee5\u53ca\u6216\u3001]+" + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json new file mode 100644 index 00000000..8ea723ac --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/location_rules.json @@ -0,0 +1,100 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "location", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "loc_desktop", + "pattern": "桌面", + "description": "Desktop directory", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "desktop", + "include_hidden": false + } + }, + { + "id": "loc_download", + "pattern": "下载", + "description": "Downloads directory", + "enabled": true, + "priority": 200, + 
"metadata": { + "xdg_type": "download", + "include_hidden": false + } + }, + { + "id": "loc_documents_dir", + "pattern": "文档目录|文档文件夹", + "description": "Documents directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "documents", + "include_hidden": false + } + }, + { + "id": "loc_pictures_dir", + "pattern": "图片目录|图片文件夹|照片目录|照片文件夹", + "description": "Pictures directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "pictures", + "include_hidden": false + } + }, + { + "id": "loc_music_dir", + "pattern": "音乐目录|音乐文件夹", + "description": "Music directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "music", + "include_hidden": false + } + }, + { + "id": "loc_videos_dir", + "pattern": "视频目录|视频文件夹|电影目录|电影文件夹", + "description": "Videos directory (must contain directory/folder word)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "movies", + "include_hidden": false + } + }, + { + "id": "loc_trash", + "pattern": "回收站|垃圾箱", + "description": "Trash directory", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "trash", + "include_hidden": true + } + }, + { + "id": "loc_deleted", + "pattern": "删除的|删除掉的", + "description": "Deleted files (maps to trash)", + "enabled": true, + "priority": 200, + "metadata": { + "xdg_type": "trash", + "include_hidden": true + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json new file mode 100644 index 00000000..c469a2e2 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/noise_rules.json @@ -0,0 +1,60 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "noise", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "noise_action", + "pattern": 
"帮我找到|帮我搜|帮我找|搜索|查找|找一下|搜一下|查一下|找下|搜下|查下|找找|搜搜", + "description": "Search action words to consume", + "enabled": true, + "priority": 100, + "metadata": {} + }, + { + "id": "noise_suffix", + "pattern": "的文件|的图片|的文档|的视频|的音频|的照片|的音乐|的压缩包|的安装包", + "description": "Trailing suffix words to consume", + "enabled": true, + "priority": 90, + "metadata": {} + }, + { + "id": "noise_size_lead", + "pattern": "大小在|大小为|大小是|体积在|体积为|体积是|容量在|容量为|容量是|大小不超过|大小超过|大小不到|大小最多|大小最少|大小至少", + "description": "Size constraint lead-in words to consume", + "enabled": true, + "priority": 140, + "metadata": {} + }, + { + "id": "noise_polite", + "pattern": "请|麻烦|谢谢|帮我", + "description": "Polite words to consume", + "enabled": true, + "priority": 80, + "metadata": {} + }, + { + "id": "noise_generic", + "pattern": "里面|关于|含有|带有|和|与|以及|或者|或", + "description": "Generic filler words to consume", + "enabled": true, + "priority": 70, + "metadata": {} + }, + { + "id": "noise_location_connector", + "pattern": "上的|里的|下的", + "description": "Location connector words to consume", + "enabled": true, + "priority": 190, + "metadata": {} + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json new file mode 100644 index 00000000..21269c15 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/size_rules.json @@ -0,0 +1,97 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "size", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "size_fuzzy_large", + "pattern": "大文件|很大的|占空间的|几个G的|几个g的", + "description": "Fuzzy large files (>500MB)", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "min_bytes": 524288000, + "max_bytes": 0 + } + }, + { + "id": "size_small", + "pattern": "小文件|很小的| tiny的", + "description": "Small files (<1MB)", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "min_bytes": 0, + "max_bytes": 1048576, + 
"include_upper": false + } + }, + { + "id": "size_dynamic_min_suffix", + "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)\\s*以上", + "description": "Dynamic size min with suffix only (e.g. 10M以上, 1G以上, 500K以上)", + "enabled": true, + "priority": 160, + "metadata": { + "type": "dynamic", + "direction": "min", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + }, + { + "id": "size_dynamic_max_suffix", + "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)\\s*(?:以内|以下)", + "description": "Dynamic size max with suffix only (e.g. 10M以内, 1G以下, 500K以内)", + "enabled": true, + "priority": 160, + "metadata": { + "type": "dynamic", + "direction": "max", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + }, + { + "id": "size_dynamic", + "pattern": "(大于|超过|最少|至少|>)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?以?[上内]?", + "description": "Dynamic precise size min (e.g. 大于500M, 1G以上, 大于100兆)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "dynamic", + "direction": "min", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + }, + { + "id": "size_dynamic_less", + "pattern": "(小于|不超过|不到|最多|<)\\s*(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?以?[下内]?", + "description": "Dynamic precise size max (e.g. 小于100K, 不到1G, 小于100兆)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "dynamic", + "direction": "max", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + }, + { + "id": "size_dynamic_between", + "pattern": "(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?[\\s~\\-到至]+(?\\d+(?:\\.\\d+)?)\\s*(?[KkMmGg][Bb]?|兆|字节|千|万)?", + "description": "Size range (e.g. 
1M-10M, 100K到500K, 1兆到10兆)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "dynamic", + "direction": "range", + "unit_map": {"兆": "M", "字节": "B", "千": "K", "万": "K"} + } + } + ] + } + ] +} diff --git a/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json new file mode 100644 index 00000000..20f116eb --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/rules/zh_CN/time_rules.json @@ -0,0 +1,340 @@ +{ + "version": "1.0.0", + "groups": [ + { + "name": "time", + "version": "1.0.0", + "locale": "zh-CN", + "rules": [ + { + "id": "time_today", + "pattern": "今天|今日|今日份", + "description": "Today", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "preset": "today" + } + }, + { + "id": "time_yesterday", + "pattern": "昨天上午|昨天下午|昨天晚上|昨天|昨日|昨晚", + "description": "Yesterday", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "preset": "yesterday" + } + }, + { + "id": "time_day_before_yesterday", + "pattern": "前天", + "description": "Day before yesterday", + "enabled": true, + "priority": 200, + "metadata": { + "type": "preset", + "preset": "day_before_yesterday" + } + }, + { + "id": "time_this_week", + "pattern": "本周|这周|这个星期|这一个星期", + "description": "This week", + "enabled": true, + "priority": 190, + "metadata": { + "type": "preset", + "preset": "this_week" + } + }, + { + "id": "time_last_week", + "pattern": "上周|上个星期|上星期|上一个星期", + "description": "Last week", + "enabled": true, + "priority": 190, + "metadata": { + "type": "preset", + "preset": "last_week" + } + }, + { + "id": "time_this_month", + "pattern": "本月|这个月|当月", + "description": "This month", + "enabled": true, + "priority": 180, + "metadata": { + "type": "preset", + "preset": "this_month" + } + }, + { + "id": "time_last_month", + "pattern": "上个月|上月", + "description": "Last month", + "enabled": true, + "priority": 180, + "metadata": { + "type": "preset", + "preset": 
"last_month" + } + }, + { + "id": "time_this_year", + "pattern": "今年|本年|这年", + "description": "This year", + "enabled": true, + "priority": 170, + "metadata": { + "type": "preset", + "preset": "this_year" + } + }, + { + "id": "time_last_year", + "pattern": "去年|上一年", + "description": "Last year", + "enabled": true, + "priority": 170, + "metadata": { + "type": "preset", + "preset": "last_year" + } + }, + { + "id": "time_exact_year", + "pattern": "(?\\d{2,4})年", + "description": "Exact year (e.g. 2025年, 25年)", + "enabled": true, + "priority": 120, + "metadata": { + "type": "custom", + "format": "year", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_month_current_year", + "pattern": "(?\\d{1,2})月份?", + "description": "Month this year (e.g. 12月, 5月份)", + "enabled": true, + "priority": 120, + "metadata": { + "type": "custom", + "format": "month", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_year_month", + "pattern": "(?\\d{2,4})[年\\./\\-](?\\d{1,2})月?", + "description": "Exact year-month (e.g. 2025年12月, 2025-12)", + "enabled": true, + "priority": 140, + "metadata": { + "type": "custom", + "format": "year_month", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_date_current_year", + "pattern": "(?\\d{1,2})月(?\\d{1,2})[日号]", + "description": "Exact date this year (e.g. 
12月5日, 3月8号)", + "enabled": true, + "priority": 140, + "metadata": { + "type": "custom", + "format": "date", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_exact_full_date", + "pattern": "(?\\d{2,4})[年\\./\\-](?\\d{1,2})[月\\./\\-](?\\d{1,2})[日号]?", + "description": "Exact full date (e.g. 2025年12月5日, 2025-12-05)", + "enabled": true, + "priority": 160, + "metadata": { + "type": "custom", + "format": "full_date", + "digit_map": { + "零": 0, + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10 + }, + "tens_unit": "十" + } + }, + { + "id": "time_just_now", + "pattern": "这会儿|刚才|刚刚|刚", + "description": "Just now (last 2 hours)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "just_now", + "ago_start_seconds": 7200, + "ago_end_seconds": 0 + } + }, + { + "id": "time_recent_days", + "pattern": "这阵子|近期|最近|这几天", + "description": "Recent days (last 3 days)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "recent_days", + "ago_start_seconds": 259200, + "ago_end_seconds": 0 + } + }, + { + "id": "time_recent_dynamic_days", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?天", + "description": "Recent N days (e.g. 
最近3天, 最近三天, 近1周, 过去七天)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "days", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_recent_dynamic_hours", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?小时", + "description": "Recent N hours (e.g. 最近2小时, 最近两小时, 近一小时)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "hours", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_recent_dynamic_weeks", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?周", + "description": "Recent N weeks (e.g. 最近2周, 最近两周, 近一周)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "weeks", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_recent_dynamic_months", + "pattern": "最?近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月|过去(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月|近(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月|前(?[一二两三四五六七八九十百千万\\d]+)\\s*(?:个)?月", + "description": "Recent N months (e.g. 
最近3个月, 最近三个月, 近一月)", + "enabled": true, + "priority": 150, + "metadata": { + "type": "relative_dynamic", + "default_unit": "months", + "digit_map": {"零": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}, + "tens_unit": "十" + } + }, + { + "id": "time_past_few_days", + "pattern": "那些天|之前几天|前几天", + "description": "Past few days (3-7 days ago)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "past_few_days", + "ago_start_seconds": 604800, + "ago_end_seconds": 259200 + } + }, + { + "id": "time_a_while_ago", + "pattern": "早些时候|以前|之前", + "description": "A while ago (beyond 30 days)", + "enabled": true, + "priority": 80, + "metadata": { + "type": "relative", + "relative_id": "a_while_ago", + "ago_start_seconds": -1, + "ago_end_seconds": 2592000 + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp new file mode 100644 index 00000000..1dc03b91 --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.cpp @@ -0,0 +1,233 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. 
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "semanticquerybuilder.h"
+
+#include
+#include
+#include
+#include
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+SemanticQueryBuilder::SemanticQueryBuilder() = default;
+SemanticQueryBuilder::~SemanticQueryBuilder() = default;
+
+/**
+ * @brief Turn a parsed intent into the queries and options of the file-name,
+ *        content and OCR search paths.
+ *
+ * The file-name query is built whenever the search target allows it; an empty
+ * keyword is used when the intent carries no keywords. The content and OCR
+ * queries are only built when there are keywords and the matching index is
+ * available. The content path additionally requires at least one keyword of
+ * Global::kMinContentSearchKeywordLength characters or more.
+ *
+ * @param intent The parsed intent
+ * @return The search plan; contentQuery/ocrQuery (and their options) stay unset
+ *         for every path that is skipped
+ */
+SemanticSearchPlan SemanticQueryBuilder::build(const ParsedIntent &intent)
+{
+    SemanticSearchPlan plan;
+
+    // Location info is passed through untouched (searcher handles per-directory options)
+    plan.searchDirectories = intent.searchDirectories;
+    plan.includeHidden = intent.includeHidden;
+
+    // Time field strategy. A valid constraint that does not name a single field
+    // (Unspecified or Both) searches both timestamps, which is also what
+    // buildTimeRangeFilter() sets on the filter. Everything else that is not
+    // BirthTime falls back to ModifyTime.
+    const auto requestedField = intent.timeConstraint.timeField;
+    if (intent.timeConstraint.isValid()
+        && (requestedField == TimeField::Unspecified || requestedField == TimeField::Both)) {
+        plan.timeField = TimeField::Both;
+    } else if (requestedField == TimeField::BirthTime) {
+        plan.timeField = TimeField::BirthTime;
+    } else {
+        plan.timeField = TimeField::ModifyTime;
+    }
+
+    // Base options shared across all search paths
+    SearchOptions baseOpts = buildBaseOptions(intent.timeConstraint, intent.sizeConstraint);
+    baseOpts.setSearchMethod(SearchMethod::Indexed);
+
+    // Determine which search paths to enable based on user intent
+    const bool enableFileName = (intent.searchTarget == SearchTarget::All
+                                 || intent.searchTarget == SearchTarget::FileNameOnly);
+    const bool enableContent = (intent.searchTarget == SearchTarget::All
+                                || intent.searchTarget == SearchTarget::ContentOnly);
+    const bool enableOcr = enableContent;   // OCR is a content search path
+
+    const auto &keywords = intent.keywords;
+    const bool hasKeywords = !keywords.isEmpty();
+
+    // One keyword becomes a simple query, several keywords become a boolean query.
+    // Must only be called when hasKeywords is true.
+    const auto makeKeywordQuery = [&keywords]() -> SearchQuery {
+        if (keywords.size() == 1) {
+            return SearchFactory::createQuery(keywords.first());
+        }
+        return SearchFactory::createQuery(keywords, SearchQuery::Type::Boolean);
+    };
+
+    // --- File name search ---
+    if (enableFileName) {
+        SearchOptions opts = baseOpts;
+        FileNameOptionsAPI fnameApi(opts);
+
+        if (!intent.fileExtensions.isEmpty()) {
+            fnameApi.setFileExtensions(intent.fileExtensions);
+        }
+
+        if (hasKeywords) {
+            plan.fileNameQuery = makeKeywordQuery();
+        } else {
+            // No keywords: fall back to an empty keyword (no wildcard is added here)
+            plan.fileNameQuery = SearchFactory::createQuery("");
+        }
+
+        plan.fileNameOptions = opts;
+    }
+
+    // --- Content search (needs keywords, an available index and a long enough keyword) ---
+    if (enableContent && hasKeywords && Global::isContentIndexAvailable()) {
+        const int minLen = Global::kMinContentSearchKeywordLength;
+        bool hasValidKeyword = false;
+        for (const QString &kw : keywords) {
+            if (kw.length() >= minLen) {
+                hasValidKeyword = true;
+                break;
+            }
+        }
+
+        if (hasValidKeyword) {
+            SearchOptions opts = baseOpts;
+            ContentOptionsAPI contentApi(opts);
+
+            // Enable filename-content mixed AND search
+            contentApi.setFilenameContentMixedAndSearchEnabled(true);
+
+            plan.contentQuery = makeKeywordQuery();
+            plan.contentOptions = opts;
+        }
+    }
+
+    // --- OCR search (needs keywords and an available index) ---
+    // NOTE(review): unlike the content path, no minimum keyword length is enforced here;
+    // confirm that this is intended.
+    if (enableOcr && hasKeywords && Global::isOcrTextIndexAvailable()) {
+        SearchOptions opts = baseOpts;
+        OcrTextOptionsAPI ocrApi(opts);
+
+        // Enable filename-OCR content mixed AND search
+        ocrApi.setFilenameOcrContentMixedAndSearchEnabled(true);
+
+        plan.ocrQuery = makeKeywordQuery();
+        plan.ocrOptions = opts;
+    }
+
+    return plan;
+}
+
+/**
+ * @brief Translate a time constraint into a TimeRangeFilter.
+ *
+ * Presets map onto the filter's own preset setters. DayBeforeYesterday is
+ * expressed as an explicit 00:00:00 - 23:59:59 range two days before today.
+ * Relative and Custom constraints both carry their bounds in
+ * customStart/customEnd.
+ *
+ * @param tc The time constraint taken from the parsed intent
+ * @return A default-constructed filter when @p tc is not valid, otherwise the
+ *         configured filter
+ */
+TimeRangeFilter SemanticQueryBuilder::buildTimeRangeFilter(const TimeConstraint &tc) const
+{
+    TimeRangeFilter filter;
+
+    if (!tc.isValid()) {
+        return filter;
+    }
+
+    switch (tc.kind) {
+    case TimeConstraintKind::Preset:
+        switch (tc.preset) {
+        case TimePreset::Today:
+            filter.setToday();
+            break;
+        case TimePreset::Yesterday:
+            filter.setYesterday();
+            break;
+        case TimePreset::DayBeforeYesterday: {
+            const QDate dayBefore = QDate::currentDate().addDays(-2);
+            filter.setRange(QDateTime(dayBefore, QTime(0, 0, 0)),
+                            QDateTime(dayBefore, QTime(23, 59, 59)));
+            break;
+        }
+        case TimePreset::ThisWeek:
+            filter.setThisWeek();
+            break;
+        case TimePreset::LastWeek:
+            filter.setLastWeek();
+            break;
+        case TimePreset::ThisMonth:
+            filter.setThisMonth();
+            break;
+        case TimePreset::LastMonth:
+            filter.setLastMonth();
+            break;
+        case TimePreset::ThisYear:
+            filter.setThisYear();
+            break;
+        case TimePreset::LastYear:
+            filter.setLastYear();
+            break;
+        }
+        break;
+    case TimeConstraintKind::Relative:
+    case TimeConstraintKind::Custom:
+        // Both kinds store their resolved bounds in customStart/customEnd
+        filter.setRange(tc.customStart, tc.customEnd);
+        break;
+    case TimeConstraintKind::None:
+        break;
+    }
+
+    // Set time field on the filter
+    if (tc.timeField == TimeField::BirthTime || tc.timeField == TimeField::ModifyTime) {
+        filter.setTimeField(tc.timeField);
+    } else if (tc.timeField == TimeField::Unspecified || tc.timeField == TimeField::Both) {
+        // No specific time field or both requested → search both birth and modify time
+        filter.setTimeField(TimeField::Both);
+    }
+
+    return filter;
+}
+
+// Translate a size constraint into a SizeRangeFilter.
+// Returns a default-constructed filter when the constraint is not valid; otherwise
+// copies both bounds and both inclusiveness flags from the constraint.
+SizeRangeFilter SemanticQueryBuilder::buildSizeRangeFilter(const SizeConstraint &sc) const
+{
+    SizeRangeFilter filter;
+    if (!sc.isValid()) {
+        return filter;
+    }
+    filter.setMin(sc.minSize);
+    filter.setMax(sc.maxSize);
+    filter.setIncludeLower(sc.includeLower);
+    filter.setIncludeUpper(sc.includeUpper);
+    return filter;
+}
+
+// Build the options shared by every search path.
+// A time or size filter is only attached when the filter built from the
+// corresponding constraint reports itself as valid.
+SearchOptions SemanticQueryBuilder::buildBaseOptions(const TimeConstraint &tc, const SizeConstraint &sc) const
+{
+    SearchOptions opts;
+    const TimeRangeFilter timeFilter = buildTimeRangeFilter(tc);
+    if (timeFilter.isValid()) {
+        opts.setTimeRangeFilter(timeFilter);
+    }
+    const SizeRangeFilter sizeFilter = buildSizeRangeFilter(sc);
+    if (sizeFilter.isValid()) {
+        opts.setSizeRangeFilter(sizeFilter);
+    }
+    return opts;
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h
new file mode 100644
index 00000000..824f252b
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/semanticquerybuilder.h
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef SEMANTICQUERYBUILDER_H
+#define SEMANTICQUERYBUILDER_H
+
+#include
+#include
+#include
+#include
+
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+/**
+ * @brief Search plan for all three search paths.
+ *
+ * The file-name query/options are plain members; the content and OCR
+ * query/options are optional and are left unset by the builder when that
+ * path is not to be searched.
+ */
+struct SemanticSearchPlan {
+    SearchQuery fileNameQuery;
+    SearchOptions fileNameOptions;
+    std::optional contentQuery;
+    std::optional contentOptions;
+    std::optional ocrQuery;
+    std::optional ocrOptions;
+    TimeField timeField = TimeField::ModifyTime;   // BirthTime, ModifyTime, or Both
+    QStringList searchDirectories;   // Empty = use default homePath
+    bool includeHidden = false;   // For trash directory
+};
+
+/**
+ * @brief Converts a ParsedIntent into concrete SearchQuery + SearchOptions for each path.
+ */
+class SemanticQueryBuilder
+{
+public:
+    SemanticQueryBuilder();
+    ~SemanticQueryBuilder();
+
+    /**
+     * @brief Build a search plan from parsed intent.
+     * @param intent The parsed intent
+     * @return A search plan with queries and options for each search path
+     */
+    SemanticSearchPlan build(const ParsedIntent &intent);
+
+private:
+    TimeRangeFilter buildTimeRangeFilter(const TimeConstraint &tc) const;
+    SizeRangeFilter buildSizeRangeFilter(const SizeConstraint &sc) const;
+    SearchOptions buildBaseOptions(const TimeConstraint &tc, const SizeConstraint &sc) const;
+};
+
+DFM_SEARCH_END_NS
+
+#endif   // SEMANTICQUERYBUILDER_H
diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp
new file mode 100644
index 00000000..8980e5a6
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.cpp
@@ -0,0 +1,244 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "semanticruleengine.h"
+#include "ruleconfigloader.h"
+
+#include
+#include
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+namespace {
+
+/**
+ * @brief Merge one loaded rule group into a name -> RuleGroup map.
+ *
+ * A group whose name is not in the map yet is inserted as a whole. Otherwise
+ * only its rules are merged into the existing group: a rule whose ID already
+ * exists replaces the stored rule in place, any other rule is appended. The
+ * existing group's version and locale are left untouched.
+ */
+template<typename GroupMap>
+void mergeGroupInto(GroupMap &groups, const RuleGroup &incoming)
+{
+    const auto found = groups.find(incoming.name);
+    if (found == groups.end()) {
+        groups.insert(incoming.name, incoming);
+        return;
+    }
+
+    auto &existingRules = found.value().rules;
+    for (const Rule &rule : incoming.rules) {
+        bool replaced = false;
+        for (int i = 0; i < existingRules.size(); ++i) {
+            if (existingRules[i].id == rule.id) {
+                // Later definitions override earlier ones by ID
+                existingRules[i] = rule;
+                replaced = true;
+                break;
+            }
+        }
+        if (!replaced) {
+            existingRules.append(rule);
+        }
+    }
+}
+
+}   // namespace
+
+SemanticRuleEngine::SemanticRuleEngine(QObject *parent)
+    : QObject(parent)
+{
+}
+
+SemanticRuleEngine::~SemanticRuleEngine() = default;
+
+/**
+ * @brief Reload every rule file reported by RuleConfigLoader::ruleFilePaths().
+ *
+ * Files that fail to load are logged and skipped. When at least one group was
+ * loaded, the loaded set replaces the current groups and the group -> file
+ * path map together, so the path map never refers to groups that are gone.
+ * When nothing could be loaded the current state is kept as it is.
+ *
+ * @return true if rules are available afterwards (freshly loaded or kept)
+ */
+bool SemanticRuleEngine::loadRules()
+{
+    decltype(m_groups) newGroups;
+    decltype(m_ruleFilePaths) newRuleFilePaths;
+
+    const auto paths = RuleConfigLoader::ruleFilePaths();
+    for (const QString &path : paths) {
+        QList<RuleGroup> loaded;
+        if (!RuleConfigLoader::loadRuleFile(path, loaded)) {
+            qWarning() << "Failed to load rule file:" << path;
+            continue;
+        }
+
+        for (const RuleGroup &group : loaded) {
+            mergeGroupInto(newGroups, group);
+            // The last file contributing to a group wins
+            newRuleFilePaths.insert(group.name, path);
+        }
+    }
+
+    if (newGroups.isEmpty()) {
+        qWarning() << "No rule files loaded, keeping existing rules";
+        return !m_groups.isEmpty();
+    }
+
+    m_groups = std::move(newGroups);
+    m_ruleFilePaths = std::move(newRuleFilePaths);
+
+    return true;
+}
+
+/**
+ * @brief Load a single rule file and merge it into the already loaded groups.
+ * @param path Path of the JSON rule file
+ * @return false if the file could not be loaded (state is left unchanged)
+ */
+bool SemanticRuleEngine::loadRuleFile(const QString &path)
+{
+    QList<RuleGroup> loaded;
+    if (!RuleConfigLoader::loadRuleFile(path, loaded)) {
+        qWarning() << "Failed to load rule file:" << path;
+        return false;
+    }
+
+    for (const RuleGroup &group : loaded) {
+        mergeGroupInto(m_groups, group);
+        m_ruleFilePaths.insert(group.name, path);
+    }
+
+    return true;
+}
+
+/**
+ * @brief Return the first match among a group's rules, tried in descending priority.
+ *
+ * Rules of equal priority keep their stored order (stable sort). Disabled
+ * rules and rules with an invalid regex are skipped.
+ *
+ * @param group The rule group name
+ * @param input The input text to match against
+ * @param outMatch Output: the regex match; only written when a rule matched
+ * @param outRuleId Output: ID of the matching rule (optional)
+ * @return true if a rule matched
+ */
+bool SemanticRuleEngine::match(const QString &group, const QString &input, QRegularExpressionMatch &outMatch,
+                               QString *outRuleId)
+{
+    // constFind avoids copying the whole RuleGroup just to read its rules
+    const auto groupIt = m_groups.constFind(group);
+    if (groupIt == m_groups.constEnd()) {
+        return false;
+    }
+
+    auto sorted = groupIt->rules;
+    std::stable_sort(sorted.begin(), sorted.end(),
+                     [](const Rule &a, const Rule &b) { return a.priority > b.priority; });
+
+    for (const Rule &rule : sorted) {
+        if (!rule.enabled || !rule.regex.isValid()) {
+            continue;
+        }
+        const QRegularExpressionMatch m = rule.regex.match(input);
+        if (m.hasMatch()) {
+            outMatch = m;
+            if (outRuleId) {
+                *outRuleId = rule.id;
+            }
+            return true;
+        }
+    }
+
+    return false;
+}
+
+QList SemanticRuleEngine::matchAll(const QString &group, const QString &input,
+                                   QStringList *outRuleIds)
+{
+    QList results;
+
+    if (!m_groups.contains(group)) {
+        return results;
+    }
+
+    const
QList &rules = m_groups.value(group).rules; + QList sorted = rules; + std::stable_sort(sorted.begin(), sorted.end(), + [](const Rule &a, const Rule &b) { return a.priority > b.priority; }); + + for (const Rule &rule : sorted) { + if (!rule.enabled || !rule.regex.isValid()) { + continue; + } + + // Use globalMatch to find ALL occurrences of this rule's pattern. + // This is important for noise rules (e.g., "和" appearing multiple times). + auto it = rule.regex.globalMatch(input); + while (it.hasNext()) { + QRegularExpressionMatch m = it.next(); + if (m.hasMatch()) { + results.append(m); + if (outRuleIds) { + outRuleIds->append(rule.id); + } + } + } + } + + return results; +} + +QVariantMap SemanticRuleEngine::ruleMetadata(const QString &group, const QString &ruleId) const +{ + if (!m_groups.contains(group)) { + return {}; + } + + for (const Rule &rule : m_groups.value(group).rules) { + if (rule.id == ruleId) { + return rule.metadata; + } + } + return {}; +} + +bool SemanticRuleEngine::hasGroup(const QString &group) const +{ + return m_groups.contains(group); +} + +QStringList SemanticRuleEngine::ruleIds(const QString &group) const +{ + const auto it = m_groups.constFind(group); + if (it == m_groups.constEnd()) { + return {}; + } + + QStringList ids; + for (const Rule &rule : it->rules) { + ids.append(rule.id); + } + return ids; +} + +QStringList SemanticRuleEngine::groupNames() const +{ + return m_groups.keys(); +} + +bool SemanticRuleEngine::parseRuleGroupStatic(const QJsonObject &groupObj, RuleGroup &outGroup) +{ + if (!groupObj.contains("name") || !groupObj.contains("rules")) { + return false; + } + + outGroup.name = groupObj.value("name").toString(); + outGroup.version = groupObj.value("version").toString("1.0.0"); + outGroup.locale = groupObj.value("locale").toString(); + + const QJsonArray rulesArray = groupObj.value("rules").toArray(); + for (const QJsonValue &rv : rulesArray) { + const QJsonObject ruleObj = rv.toObject(); + + Rule rule; + rule.id = 
ruleObj.value("id").toString(); + rule.pattern = ruleObj.value("pattern").toString(); + rule.description = ruleObj.value("description").toString(); + rule.enabled = ruleObj.value("enabled").toBool(true); + rule.priority = ruleObj.value("priority").toInt(0); + rule.metadata = ruleObj.value("metadata").toVariant().toMap(); + + if (rule.pattern.isEmpty() || rule.id.isEmpty()) { + continue; + } + + rule.regex.setPattern(rule.pattern); + rule.regex.setPatternOptions(QRegularExpression::CaseInsensitiveOption); + if (!rule.regex.isValid()) { + qWarning() << "Invalid regex for rule" << rule.id << ":" << rule.regex.errorString(); + continue; + } + + outGroup.rules.append(rule); + } + + return !outGroup.rules.isEmpty(); +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h new file mode 100644 index 00000000..6f172e6b --- /dev/null +++ b/src/dfm-search/dfm-search-lib/semantic/semanticruleengine.h @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTICRULEENGINE_H +#define SEMANTICRULEENGINE_H + +#include +#include +#include +#include +#include + +#include + +DFM_SEARCH_BEGIN_NS + +struct Rule { + QString id; + QString pattern; + QString description; + bool enabled = true; + int priority = 0; + QVariantMap metadata; + QRegularExpression regex; +}; + +struct RuleGroup { + QString name; + QString version; + QString locale; + QList rules; +}; + +/** + * @brief Rule engine that loads regex rules from JSON config files. + * + * Provides match/matchAll operations with priority-based ordering. + */ +class SemanticRuleEngine : public QObject +{ + Q_OBJECT + +public: + explicit SemanticRuleEngine(QObject *parent = nullptr); + ~SemanticRuleEngine() override; + + /** + * @brief Load rules from all rule files in the config directory. 
+     * @return true if at least one rule group was loaded; when nothing could be
+     *         loaded the previously loaded rules are kept and the result tells
+     *         whether any of those exist.
+     */
+    bool loadRules();
+
+    /**
+     * @brief Load rules from a specific rule file.
+     * Useful for testing or loading custom rule files.
+     * Merges with any previously loaded rules by group name; within a group,
+     * a rule with an already known ID replaces the stored rule.
+     * @param path Absolute path to a JSON rule file
+     * @return true if the file was loaded successfully
+     */
+    bool loadRuleFile(const QString &path);
+
+    /**
+     * @brief Find the highest-priority matching rule in a group.
+     * Disabled rules and rules with an invalid regex are skipped; rules of
+     * equal priority are tried in their stored order.
+     * @param group The rule group name
+     * @param input The input text to match against
+     * @param outMatch Output: the regex match result (only written on success)
+     * @param outRuleId Output: the matched rule's ID (optional)
+     * @return true if a match was found
+     */
+    bool match(const QString &group, const QString &input, QRegularExpressionMatch &outMatch,
+               QString *outRuleId = nullptr);
+
+    /**
+     * @brief Find every occurrence of every matching rule in a group (priority order).
+     * A rule that matches the input several times contributes one entry per
+     * occurrence.
+     * @param group The rule group name
+     * @param input The input text to match against
+     * @param outRuleIds Output: matched rule IDs (optional, parallel to result list)
+     * @return List of all matches; empty if the group is unknown
+     */
+    QList matchAll(const QString &group, const QString &input,
+                   QStringList *outRuleIds = nullptr);
+
+    /**
+     * @brief Get a rule's metadata by group and rule ID.
+     * @return The rule's metadata, or an empty map if the group or rule is unknown
+     */
+    QVariantMap ruleMetadata(const QString &group, const QString &ruleId) const;
+
+    /**
+     * @brief Get all rule IDs in a group, in stored order (disabled rules included).
+     * @return The rule IDs, or an empty list if the group is unknown
+     */
+    QStringList ruleIds(const QString &group) const;
+
+    /**
+     * @brief Check if a rule group with this name has been loaded.
+     * Only the existence of the group is checked, not whether any of its
+     * rules is enabled.
+     */
+    bool hasGroup(const QString &group) const;
+
+    /**
+     * @brief Get the list of loaded rule group names.
+     */
+    QStringList groupNames() const;
+
+    /**
+     * @brief Static helper to parse a rule group from JSON.
+     */
+    static bool parseRuleGroupStatic(const QJsonObject &groupObj, RuleGroup &outGroup);
+
+private:
+    bool parseRuleGroup(const QJsonObject &groupObj, RuleGroup &outGroup);
+
+    QMap m_groups;
+    QMap m_ruleFilePaths;   // group name -> resolved file path
+};
+
+DFM_SEARCH_END_NS
+
+#endif   // SEMANTICRULEENGINE_H
diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp
new file mode 100644
index 00000000..8f9df560
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher.cpp
@@ -0,0 +1,368 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include
+#include "semanticsearcher_p.h"
+#include "semanticquerybuilder.h"
+#include "intentparser.h"
+#include "semanticruleengine.h"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+DFM_SEARCH_BEGIN_NS
+
+/**
+ * @brief Create the private state of a SemanticSearcher.
+ *
+ * The rule engine and the timeout timer are QObject children of @p q_ptr and
+ * are destroyed with it. The query builder is a plain heap object owned by
+ * this class and released in the destructor. Rules are loaded immediately; a
+ * failure is only logged.
+ *
+ * @param q_ptr The public SemanticSearcher that owns this data object
+ */
+SemanticSearcherData::SemanticSearcherData(SemanticSearcher *q_ptr)
+    : q(q_ptr), ruleEngine(new SemanticRuleEngine(q)), intentParser(new IntentParser(ruleEngine)), queryBuilder(new SemanticQueryBuilder()), timeoutTimer(new QTimer(q))
+{
+    timeoutTimer->setSingleShot(true);
+    timeoutTimer->setInterval(timeoutSeconds * 1000);
+
+    // A timeout cancels the running engines; completion is then reported through
+    // the regular finished handling of doSearch()
+    QObject::connect(timeoutTimer, &QTimer::timeout, q, [this]() {
+        qWarning() << "Semantic search timed out after" << timeoutSeconds << "seconds";
+        doCancel();
+    });
+
+    // Load rules
+    if (!ruleEngine->loadRules()) {
+        qWarning() << "Failed to load semantic rules";
+    }
+}
+
+/**
+ * @brief Cancel any running search and release the owned query builder.
+ */
+SemanticSearcherData::~SemanticSearcherData()
+{
+    doCancel();
+
+    // queryBuilder is a plain (non-QObject) heap object and nothing else frees it
+    delete queryBuilder;
+    queryBuilder = nullptr;
+
+    // NOTE(review): intentParser is also created with new. Whether it is a QObject
+    // parented to ruleEngine cannot be told from this file - confirm, and delete it
+    // here if it is not.
+}
+
+void SemanticSearcherData::doSearch(const QString &naturalLanguage, const QStringList &searchDirectories)
+{
+    if (naturalLanguage.trimmed().isEmpty()) {
+        Q_EMIT q->errorOccurred(SearchError(SearchErrorCode::InvalidQuery));
+        return;
+    }
+
+    // Step 1: Validate + reset state
+    cancelled.store(false);
+    allResults.clear();
+    seenPaths.clear();
+
status.store(SearchStatus::Searching); + Q_EMIT q->statusChanged(SearchStatus::Searching); + + // Step 2: Parse natural language into intent (before searchStarted + // so that intentParsed listeners have the data when searchStarted fires) + ParsedIntent intent; + intentParser->parse(naturalLanguage, intent); + Q_EMIT q->intentParsed(intent); + + Q_EMIT q->searchStarted(); + + // Step 3: Build search plan + const SemanticSearchPlan plan = queryBuilder->build(intent); + + // Step 4: Determine search directories + // Priority: caller-specified directories > NLP-parsed directories > home directory + QStringList dirs; + if (!searchDirectories.isEmpty()) { + dirs = searchDirectories; + } else if (!plan.searchDirectories.isEmpty()) { + dirs = plan.searchDirectories; + } else { + dirs = QStringList { QDir::homePath() }; + } + + // Step 5: Set up signal/slot handlers + auto onFinished = [this](const SearchResultList &results) { + // Collect and deduplicate results from each engine's final result list + for (const SearchResult &r : results) { + if (!seenPaths.contains(r.path())) { + seenPaths.insert(r.path()); + allResults.append(r); + } + } + + if (pendingFinishCount.fetch_sub(1) == 1) { + // All engines finished + timeoutTimer->stop(); + + // Truncate final deduplicated results to maxResults + if (maxResults > 0 && allResults.size() > maxResults) { + allResults = allResults.mid(0, maxResults); + } + + if (cancelled.load()) { + status.store(SearchStatus::Cancelled); + Q_EMIT q->statusChanged(SearchStatus::Cancelled); + Q_EMIT q->searchCancelled(); + } else { + status.store(SearchStatus::Finished); + Q_EMIT q->statusChanged(SearchStatus::Finished); + Q_EMIT q->searchFinished(allResults); + } + } + }; + + auto onError = [](const SearchError &error) { + qWarning() << "Search error:" << error.message(); + // Don't propagate individual engine errors to caller + // The other engines may still produce valid results + }; + + // Step 6: Clean up any previous search engines + 
pendingFinishCount.store(0); + for (SearchEngine *e : engines) { + e->deleteLater(); + } + engines.clear(); + + // Step 7: Helper to prepare options with multi-path and hidden settings + auto prepareOptions = [&dirs, &plan](const SearchOptions &baseOpts) -> SearchOptions { + SearchOptions opts = baseOpts; + opts.setSearchPaths(dirs); + if (plan.includeHidden) { + opts.setIncludeHidden(true); + } + return opts; + }; + + // Apply caller-level options + auto applyCallerOptions = [this](SearchOptions &opts) { + if (detailedResultsEnabled) { + opts.setDetailedResultsEnabled(true); + } + if (maxResults > 0) { + opts.setMaxResults(maxResults); + } + }; + + // Step 8: Launch up to 3 engines (FileName, Content, OCR) + // TimeField::Both is no longer expanded here; it is handled by the Lucene strategy layer. + // Multiple directories are passed via setSearchPaths(). + + // File name search (always, if index is ready) + if (Global::isFileNameIndexReadyForSearch()) { + SearchOptions fnameOpts = prepareOptions(plan.fileNameOptions); + applyCallerOptions(fnameOpts); + createAndLaunchEngine(SearchType::FileName, plan.fileNameQuery, + fnameOpts, onFinished, onError); + } + + // Content search + if (plan.contentQuery.has_value() && plan.contentOptions.has_value()) { + SearchOptions contentOpts = prepareOptions(*plan.contentOptions); + applyCallerOptions(contentOpts); + createAndLaunchEngine(SearchType::Content, *plan.contentQuery, + contentOpts, onFinished, onError); + } + + // OCR search + if (plan.ocrQuery.has_value() && plan.ocrOptions.has_value()) { + SearchOptions ocrOpts = prepareOptions(*plan.ocrOptions); + applyCallerOptions(ocrOpts); + createAndLaunchEngine(SearchType::Ocr, *plan.ocrQuery, + ocrOpts, onFinished, onError); + } + + // Step 9: Handle no-engine case + if (pendingFinishCount.load() == 0) { + timeoutTimer->stop(); + status.store(SearchStatus::Finished); + Q_EMIT q->statusChanged(SearchStatus::Finished); + Q_EMIT q->searchFinished({}); + } else { + if 
(timeoutSeconds > 0) {
+            timeoutTimer->start();
+        }
+    }
+}
+
+/**
+ * @brief Create, configure and start one sub-engine.
+ *
+ * The engine is parented to q, gets @p options, has both callbacks connected
+ * (with q as context), is appended to the engines list and counted as pending
+ * before its search is started. If no engine can be created, nothing is
+ * counted and the function returns.
+ */
+void SemanticSearcherData::createAndLaunchEngine(
+        SearchType type,
+        const SearchQuery &query,
+        const SearchOptions &options,
+        std::function onFinished,
+        std::function onError)
+{
+    SearchEngine *engine = SearchEngine::create(type, q);
+    if (!engine) {
+        // Not counted as pending, so the remaining engines can still complete the search
+        qWarning() << "Failed to create search engine, skipping this search path";
+        return;
+    }
+    engine->setSearchOptions(options);
+
+    QObject::connect(engine, &SearchEngine::searchFinished, q, onFinished);
+    QObject::connect(engine, &SearchEngine::errorOccurred, q, onError);
+
+    engines.append(engine);
+    // Count the engine before starting it so its finished signal always finds a
+    // non-zero counter
+    pendingFinishCount.fetch_add(1);
+    engine->search(query);
+}
+
+/**
+ * @brief Flag the search as cancelled, stop the timeout timer and cancel every sub-engine.
+ */
+void SemanticSearcherData::doCancel()
+{
+    cancelled.store(true);
+    timeoutTimer->stop();
+
+    for (SearchEngine *e : engines) {
+        e->cancel();
+    }
+}
+
+// --- SemanticSearcher public API ---
+
+SemanticSearcher::SemanticSearcher(QObject *parent)
+    : QObject(parent), d_ptr(new SemanticSearcherData(this))
+{
+}
+
+SemanticSearcher::~SemanticSearcher() = default;
+
+SearchStatus SemanticSearcher::status() const
+{
+    return d_ptr->status.load();
+}
+
+/**
+ * @brief Set the search timeout.
+ * @param seconds Timeout in seconds; a value <= 0 disables the timeout
+ *                (doSearch() only starts the timer for positive values)
+ */
+void SemanticSearcher::setSearchTimeout(int seconds)
+{
+    d_ptr->timeoutSeconds = seconds;
+    // Never hand a non-positive interval to QTimer; the timer is not started in that case
+    if (seconds > 0) {
+        d_ptr->timeoutTimer->setInterval(seconds * 1000);
+    }
+}
+
+int SemanticSearcher::searchTimeout() const
+{
+    return d_ptr->timeoutSeconds;
+}
+
+/**
+ * @brief Start an asynchronous search without caller-specified directories.
+ */
+void SemanticSearcher::search(const QString &naturalLanguage)
+{
+    search(naturalLanguage, QStringList());
+}
+
+/**
+ * @brief Start an asynchronous search.
+ *
+ * The request is ignored (with a warning) while a search is still in progress.
+ *
+ * @param naturalLanguage The natural language query
+ * @param searchDirectories Directories to search; empty lets doSearch() pick them
+ */
+void SemanticSearcher::search(const QString &naturalLanguage, const QStringList &searchDirectories)
+{
+    if (d_ptr->status.load() == SearchStatus::Searching) {
+        qWarning() << "Search already in progress";
+        return;
+    }
+
+    d_ptr->doSearch(naturalLanguage, searchDirectories);
+}
+
+bool SemanticSearcher::isSemanticQuery(const QString &input) const
+{
+    if (input.trimmed().isEmpty()) {
+        return false;
+    }
+
+    ParsedIntent intent;
+    d_ptr->intentParser->parse(input, intent);
+
+    return
intent.timeConstraint.isValid()
+            || intent.sizeConstraint.isValid()
+            || !intent.fileExtensions.isEmpty()
+            || !intent.searchDirectories.isEmpty();
+}
+
+void SemanticSearcher::cancel()
+{
+    d_ptr->doCancel();
+}
+
+void SemanticSearcher::setDetailedResultsEnabled(bool enable)
+{
+    d_ptr->detailedResultsEnabled = enable;
+}
+
+bool SemanticSearcher::isDetailedResultsEnabled() const
+{
+    return d_ptr->detailedResultsEnabled;
+}
+
+void SemanticSearcher::setMaxResults(int count)
+{
+    d_ptr->maxResults = count;
+}
+
+int SemanticSearcher::maxResults() const
+{
+    return d_ptr->maxResults;
+}
+
+SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage)
+{
+    return searchSync(naturalLanguage, {});
+}
+
+/**
+ * @brief Run a search and block (in a nested event loop) until it has ended.
+ *
+ * @param naturalLanguage The natural language query
+ * @param searchDirectories Directories to search; empty lets doSearch() pick them
+ * @return The deduplicated results on success, an empty list if the search was
+ *         cancelled (including a timeout), or an InvalidQuery error if the input
+ *         is blank or another search is still in progress
+ */
+SearchResultExpected SemanticSearcher::searchSync(const QString &naturalLanguage, const QStringList &searchDirectories)
+{
+    if (d_ptr->status.load() == SearchStatus::Searching) {
+        qWarning() << "Search already in progress";
+        return Dtk::Core::DUnexpected(SearchError(SearchErrorCode::InvalidQuery));
+    }
+
+    if (naturalLanguage.trimmed().isEmpty()) {
+        return Dtk::Core::DUnexpected(SearchError(SearchErrorCode::InvalidQuery));
+    }
+
+    SearchResultList results;
+    bool hasError = false;
+    SearchError lastError;
+    bool cancelled = false;
+    bool done = false;
+
+    QEventLoop eventLoop;
+
+    // The handlers capture the locals above by reference, so they must not outlive
+    // this call. Using the stack-allocated event loop as the connection context makes
+    // Qt drop all three connections when eventLoop is destroyed on return.
+    // The internal doSearch timeout mechanism is relied upon for actual cancellation.
+    QObject::connect(this, &SemanticSearcher::searchFinished, &eventLoop,
+                     [&](const SearchResultList &r) {
+                         if (!done) {
+                             results = r;
+                             done = true;
+                             eventLoop.quit();
+                         }
+                     });
+
+    QObject::connect(this, &SemanticSearcher::searchCancelled, &eventLoop,
+                     [&]() {
+                         if (!done) {
+                             cancelled = true;
+                             done = true;
+                             eventLoop.quit();
+                         }
+                     });
+
+    QObject::connect(this, &SemanticSearcher::errorOccurred, &eventLoop,
+                     [&](const SearchError &error) {
+                         if (!done) {
+                             hasError = true;
+                             lastError = error;
+                             done = true;
+                             eventLoop.quit();
+                         }
+                     });
+
+    // Start the async search (uses internal timeout mechanism)
+    d_ptr->doSearch(naturalLanguage, searchDirectories);
+
+    // doSearch() may already have finished synchronously (e.g. when no engine could
+    // be launched). quit() issued before exec() has no effect, so entering the loop
+    // in that case would block forever.
+    if (!done) {
+        // Block until completion, cancellation, or error
+        eventLoop.exec();
+    }
+
+    if (cancelled) {
+        return results;
+    }
+
+    if (hasError) {
+        return Dtk::Core::DUnexpected(lastError);
+    }
+
+    return results;
+}
+
+DFM_SEARCH_END_NS
diff --git a/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h
new file mode 100644
index 00000000..dd146ff2
--- /dev/null
+++ b/src/dfm-search/dfm-search-lib/semantic/semanticsearcher_p.h
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd.
+// +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SEMANTICSEARCHER_P_H +#define SEMANTICSEARCHER_P_H + +#include +#include + +#include + +#include +#include +#include +#include + +DFM_SEARCH_BEGIN_NS + +class SemanticRuleEngine; +class IntentParser; +class SemanticQueryBuilder; +class SemanticSearchPlan; +class SemanticSearcher; + +class SemanticSearcherData +{ +public: + explicit SemanticSearcherData(SemanticSearcher *q); + ~SemanticSearcherData(); + + void doSearch(const QString &naturalLanguage, const QStringList &searchDirectories); + void doCancel(); + + /** + * @brief Create, configure, and launch a search engine + * + * Creates a SearchEngine of the given type, sets its options, connects + * signal/slot handlers, appends it to the engines list, increments the + * pending finish counter, and starts the search. + * + * @param type The search engine type (FileName, Content, or Ocr) + * @param query The search query to execute + * @param options The search options (including multi-path and time filter) + * @param onFinished Callback for engine completion and result collection + * @param onError Callback for error handling + */ + void createAndLaunchEngine(SearchType type, + const SearchQuery &query, + const SearchOptions &options, + std::function onFinished, + std::function onError); + + SemanticSearcher *q = nullptr; + + // State + std::atomic status { SearchStatus::Ready }; + std::atomic cancelled { false }; + int timeoutSeconds = 60; + + // Core components (owned) + SemanticRuleEngine *ruleEngine = nullptr; + IntentParser *intentParser = nullptr; + SemanticQueryBuilder *queryBuilder = nullptr; + + // Sub-engines (owned per search, parented to q for auto-cleanup) + QList engines; + std::atomic pendingFinishCount { 0 }; + + // Result collection + SearchResultList allResults; + QSet seenPaths; + + // Timeout + QTimer *timeoutTimer = nullptr; + + // Options forwarded from caller + bool detailedResultsEnabled = false; + int maxResults = 0; // 0 = 
unlimited +}; + +DFM_SEARCH_END_NS + +#endif // SEMANTICSEARCHER_P_H diff --git a/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp b/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp index e2182d9f..da5087b9 100644 --- a/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp +++ b/src/dfm-search/dfm-search-lib/textsearch/textsearchapi.cpp @@ -45,8 +45,28 @@ bool TextSearchOptionsAPI::isFullTextRetrievalEnabled() const return m_options.customOption("fullTextRetrieval").toBool(); } +void TextSearchOptionsAPI::setFilenameKeyword(const QString &keyword) +{ + m_options.setCustomOption("filenameKeyword", keyword); +} + +QString TextSearchOptionsAPI::filenameKeyword() const +{ + return m_options.customOption("filenameKeyword").toString(); +} + // ==================== TextSearchResultAPI ==================== +void TextSearchResultAPI::setFileSizeBytes(qint64 bytes) +{ + m_result.setCustomAttribute("fileSizeBytes", bytes); +} + +qint64 TextSearchResultAPI::fileSizeBytes() const +{ + return m_result.customAttribute("fileSizeBytes").toLongLong(); +} + TextSearchResultAPI::TextSearchResultAPI(SearchResult &result) : m_result(result) { diff --git a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp index 889e44f5..cc95dd14 100644 --- a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp +++ b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.cpp @@ -14,35 +14,12 @@ #include #include -#include "chineseanalyzer.h" - using namespace Lucene; DFM_SEARCH_BEGIN_NS namespace ContentHighlighter { -namespace { -QString mergeAdjacentHighlightTags(const QString &text) -{ - // 使用正则表达式搜索和替换相邻的高亮标签 - QString result = text; - - // 替换模式: 将被删除,从而合并相邻的标签 - static const QString pattern = QLatin1String(""); - static const QString replacement = QLatin1String(""); - - // 循环替换直到不再有变化(处理连续多个标签的情况) - QString previousResult; - do { - previousResult = result; - result = result.replace(pattern, 
replacement); - } while (result != previousResult); - - return result; -} -} // namespace - namespace { struct KeywordMatch @@ -265,50 +242,6 @@ QString customHighlight(const QStringList &keywords, const QString &content, int return resultSnippet; } -QString highlight(const QString &content, const Lucene::QueryPtr &query, int maxLength, bool enableHtml) -{ - try { - if (content.isEmpty()) { - return {}; - } - - // 尝试使用Lucene高亮器 - FormatterPtr formatter; - if (enableHtml) { - formatter = newLucene(L"", L""); - } else { - formatter = newLucene(L"", L""); - } - HighlighterScorerPtr scorer = newLucene(query); - HighlighterPtr highlighter = newLucene(formatter, scorer); - - // 创建分析器 - AnalyzerPtr analyzer = newLucene(); - - TokenStreamPtr tokenStream = analyzer->tokenStream(L"contents", newLucene(content.toStdWString())); - Collection fragments = highlighter->getBestFragments(tokenStream, content.toStdWString(), 1); - - QString result; - if (!fragments.empty() && !fragments[0].empty()) { - // Lucene高亮成功,使用其结果 - result = QString::fromStdWString(fragments[0]); - } else { - // TODO: Lucene高亮失败,使用自定义高亮方法 - // result = customHighlight(content, query, contentLength); - } - - // 处理连续的高亮标签 - if (enableHtml) { - result = mergeAdjacentHighlightTags(result); - } - - return result.simplified(); - } catch (const LuceneException &e) { - qWarning() << "Highlighting failed:" << QString::fromStdWString(e.getError()); - return QStringLiteral("(Error highlighting content)"); - } -} - } // namespace ContentHighlighter DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h index e35a5241..e926fd66 100644 --- a/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h +++ b/src/dfm-search/dfm-search-lib/utils/contenthighlighter.h @@ -39,16 +39,6 @@ namespace ContentHighlighter { */ QString customHighlight(const QStringList &keywords, const QString &content, int maxLength, bool enableHtml); -/** - * 
@brief 高亮搜索结果中的关键词 - * @param content 要高亮的内容 - * @param query Lucene查询对象 - * @param maxLength 最大显示长度 - * @param enableHtml 是否启用HTML标签高亮,默认为false - * @return 高亮后的内容 - */ -QString highlight(const QString &content, const Lucene::QueryPtr &query, int maxLength, bool enableHtml); - } // namespace ContentHighlighter DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp b/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp new file mode 100644 index 00000000..8dbc7c4a --- /dev/null +++ b/src/dfm-search/dfm-search-lib/utils/contentretriever.cpp @@ -0,0 +1,333 @@ +// SPDX-FileCopyrightText: 2026 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include + +#include "utils/contenthighlighter.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Lucene; + +DFM_SEARCH_BEGIN_NS + +namespace { + +const wchar_t *contentFieldName(SearchType type) +{ + return (type == SearchType::Ocr) + ? LuceneFieldNames::OcrText::kOcrContents + : LuceneFieldNames::Content::kContents; +} + +const wchar_t *pathFieldName(SearchType type) +{ + return (type == SearchType::Ocr) + ? LuceneFieldNames::OcrText::kPath + : LuceneFieldNames::Content::kPath; +} + +QString defaultIndexDirectoryForType(SearchType type) +{ + return (type == SearchType::Ocr) + ? 
Global::ocrTextIndexDirectory() + : Global::contentIndexDirectory(); +} + +QString storedContentFromDocument(const DocumentPtr &doc, SearchType type) +{ + if (!doc) { + return {}; + } + + const String contentField = doc->get(contentFieldName(type)); + if (contentField.empty()) { + return {}; + } + + return QString::fromStdWString(contentField); +} + +QStringList splitKeywords(const QString &keyword) +{ + if (keyword.isEmpty()) { + return {}; + } +#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) + return keyword.split(',', Qt::SkipEmptyParts); +#else + return keyword.split(',', QString::SkipEmptyParts); +#endif +} + +DocumentPtr findDocumentByPath(const SearcherPtr &searcher, + const QString &path, + SearchType type) +{ + TermPtr term = newLucene(pathFieldName(type), path.toStdWString()); + QueryPtr query = newLucene(term); + + TopDocsPtr topDocs = searcher->search(query, 1); + if (!topDocs || topDocs->totalHits == 0) { + return nullptr; + } + + return searcher->doc(topDocs->scoreDocs[0]->doc); +} + +struct CachedIndexContext +{ + QString indexDirectory; + FSDirectoryPtr directory; + IndexReaderPtr reader; + SearcherPtr searcher; +}; + +} // namespace + +struct ContentRetriever::Private +{ + QString contentIndexDirectory; + QString ocrIndexDirectory; + mutable QMutex mutex; + mutable QHash cacheByType; + + CachedIndexContext *ensureIndexContext(SearchType type, const QString &indexDir) const + { + CachedIndexContext &ctx = cacheByType[static_cast(type)]; + if (ctx.searcher && ctx.reader && ctx.directory && ctx.indexDirectory == indexDir) { + try { + if (!ctx.reader->isCurrent()) { + IndexReaderPtr reopened = ctx.reader->reopen(true); + if (reopened != ctx.reader) { + ctx.reader = reopened; + ctx.searcher = newLucene(ctx.reader); + } + } + return &ctx; + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: failed to refresh index reader" + << QString::fromStdWString(e.getError()); + ctx = {}; + } catch (const std::exception &e) { + qWarning() << 
"ContentRetriever: failed to refresh index reader" << e.what(); + ctx = {}; + } + } + + try { + ctx.indexDirectory = indexDir; + ctx.directory = FSDirectory::open(indexDir.toStdWString()); + if (!IndexReader::indexExists(ctx.directory)) { + ctx = {}; + return nullptr; + } + + ctx.reader = IndexReader::open(ctx.directory, true); + ctx.searcher = newLucene(ctx.reader); + return &ctx; + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: failed to open index" + << QString::fromStdWString(e.getError()); + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: failed to open index" << e.what(); + } + + ctx = {}; + return nullptr; + } +}; + +ContentRetriever::ContentRetriever(QObject *parent) + : QObject(parent), + d(std::make_unique()) +{ +} + +ContentRetriever::~ContentRetriever() = default; + +void ContentRetriever::setIndexDirectory(SearchType type, const QString &indexDirectory) +{ + if (type != SearchType::Content && type != SearchType::Ocr) { + return; + } + + QMutexLocker locker(&d->mutex); + if (type == SearchType::Ocr) { + d->ocrIndexDirectory = indexDirectory; + } else { + d->contentIndexDirectory = indexDirectory; + } + d->cacheByType.remove(static_cast(type)); +} + +QString ContentRetriever::indexDirectory(SearchType type) const +{ + // Effective index directory for `type`: the configured override, else the default; empty for other types. + if (type != SearchType::Content && type != SearchType::Ocr) { + return {}; + } + + // setIndexDirectory() writes these strings under d->mutex, so they are read under it too. + QMutexLocker locker(&d->mutex); + const QString &configured = (type == SearchType::Ocr) + ? 
d->ocrIndexDirectory + : d->contentIndexDirectory; + + return configured.isEmpty() ? defaultIndexDirectoryForType(type) : configured; +} + +QString ContentRetriever::fetchHighlight(const QString &path, + const QString &keyword, + SearchType type, + const HighlightOptions &options) const +{ + if (path.isEmpty() || keyword.isEmpty()) return {}; + if (type != SearchType::Content && type != SearchType::Ocr) return {}; + + const QStringList keywords = splitKeywords(keyword); + if (keywords.isEmpty()) return {}; + + const QString indexDir = indexDirectory(type); + + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return {}; + } + + try { + const DocumentPtr doc = findDocumentByPath(ctx->searcher, path, type); + const QString content = storedContentFromDocument(doc, type); + if (content.isEmpty()) { + return {}; + } + + return ContentHighlighter::customHighlight( + keywords, content, options.maxPreviewLength, options.enableHtml); + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: error fetching highlight for" << path + << QString::fromStdWString(e.getError()); + return {}; + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + return {}; + } +} + +QMap ContentRetriever::fetchHighlights(const QStringList &paths, + const QString &keyword, + SearchType type, + const HighlightOptions &options) const +{ + QMap results; + if (paths.isEmpty() || keyword.isEmpty()) return results; + if (type != SearchType::Content && type != SearchType::Ocr) return results; + + const QStringList keywords = splitKeywords(keyword); + if (keywords.isEmpty()) return results; + + const QString indexDir = indexDirectory(type); + + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return results; + } + + for (const QString &path : paths) { + try { + const DocumentPtr doc = findDocumentByPath(ctx->searcher, 
path, type); + const QString content = storedContentFromDocument(doc, type); + if (content.isEmpty()) { + results.insert(path, {}); + continue; + } + + results.insert(path, ContentHighlighter::customHighlight( + keywords, content, options.maxPreviewLength, options.enableHtml)); + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: error for" << path + << QString::fromStdWString(e.getError()); + results.insert(path, {}); + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + results.insert(path, {}); + } + } + + return results; +} + +QString ContentRetriever::fetchContent(const QString &path, SearchType type) const +{ + if (path.isEmpty()) return {}; + if (type != SearchType::Content && type != SearchType::Ocr) return {}; + + const QString indexDir = indexDirectory(type); + + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return {}; + } + + try { + return storedContentFromDocument(findDocumentByPath(ctx->searcher, path, type), type); + } catch (const LuceneException &e) { + qWarning() << "ContentRetriever: error fetching content for" << path + << QString::fromStdWString(e.getError()); + return {}; + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + return {}; + } +} + +QMap ContentRetriever::fetchContents(const QStringList &paths, + SearchType type) const +{ + QMap results; + if (paths.isEmpty()) return results; + if (type != SearchType::Content && type != SearchType::Ocr) return results; + + const QString indexDir = indexDirectory(type); + + QMutexLocker locker(&d->mutex); + CachedIndexContext *ctx = d->ensureIndexContext(type, indexDir); + if (!ctx || !ctx->searcher) { + return results; + } + + for (const QString &path : paths) { + try { + results.insert(path, storedContentFromDocument(findDocumentByPath(ctx->searcher, path, type), type)); + } 
catch (const LuceneException &e) { + qWarning() << "ContentRetriever: error for" << path + << QString::fromStdWString(e.getError()); + results.insert(path, {}); + } catch (const std::exception &e) { + qWarning() << "ContentRetriever: std error for" << path << e.what(); + results.insert(path, {}); + } + } + + return results; +} + +DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp index 0e1a93c2..c4443b3a 100644 --- a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.cpp @@ -5,9 +5,42 @@ #include +#include +#include +#include + DFM_SEARCH_BEGIN_NS namespace LuceneQueryUtils { +namespace { + +Lucene::String toLuceneString(const QString &str, bool caseSensitive) +{ + QString normalized = caseSensitive ? str : str.toLower(); + QByteArray utf8Bytes = normalized.toUtf8(); + Lucene::String luceneStr = Lucene::StringUtils::toUnicode(std::string(utf8Bytes.constData(), utf8Bytes.length())); + if (luceneStr.empty()) { + luceneStr = Lucene::StringUtils::toUnicode(normalized.toStdString()); + } + return luceneStr; +} + +Lucene::TermPtr buildTerm(const QString &fieldName, const QString &text, bool caseSensitive) +{ + return Lucene::newLucene( + toLuceneString(fieldName, true), + toLuceneString(text, caseSensitive)); +} + +int phrasePositionForStandardNGram2(int startOffset) +{ + // Standard lucene++ NGramTokenizer(1,2) emits 1-gram then 2-gram at each + // character offset, and every emitted token advances the phrase position by 1. + // Therefore the 2-gram starting at offset i lands at position 2 * i + 1. 
+ return startOffset * 2 + 1; +} + +} // namespace std::wstring getLuceneSpecialChars() { @@ -50,6 +83,32 @@ Lucene::String processQueryString(const QString &str, bool caseSensitive) return luceneStr; } +Lucene::QueryPtr buildNGramSearchQuery(const QString &fieldName, const QString &keyword, bool caseSensitive) +{ + if (fieldName.isEmpty() || keyword.isEmpty()) { + return nullptr; + } + + if (keyword.size() <= 2) { + return Lucene::newLucene( + buildTerm(fieldName, keyword, caseSensitive)); + } + + Lucene::PhraseQueryPtr phraseQuery = Lucene::newLucene(); + for (int pos = 0; pos + 2 <= keyword.size(); pos += 2) { + phraseQuery->add(buildTerm(fieldName, keyword.mid(pos, 2), caseSensitive), + phrasePositionForStandardNGram2(pos)); + } + + if (keyword.size() % 2 != 0) { + const int tailPos = keyword.size() - 2; + phraseQuery->add(buildTerm(fieldName, keyword.mid(tailPos, 2), caseSensitive), + phrasePositionForStandardNGram2(tailPos)); + } + + return phraseQuery; +} + Lucene::QueryPtr buildPathPrefixQuery(const QString &pathPrefix, const QString &fieldName) { if (pathPrefix.isEmpty() || fieldName.isEmpty()) { @@ -65,6 +124,30 @@ Lucene::QueryPtr buildPathPrefixQuery(const QString &pathPrefix, const QString & Lucene::StringUtils::toUnicode(normalizedPath.toStdString()))); } +Lucene::QueryPtr buildMultiPathPrefixQuery(const QStringList &paths, const QString &fieldName) +{ + if (paths.isEmpty() || fieldName.isEmpty()) { + return nullptr; + } + + if (paths.size() == 1) { + return buildPathPrefixQuery(paths.first(), fieldName); + } + + Lucene::BooleanQueryPtr boolQuery = Lucene::newLucene(); + bool hasValid = false; + + for (const QString &path : paths) { + Lucene::QueryPtr pathQuery = buildPathPrefixQuery(path, fieldName); + if (pathQuery) { + boolQuery->add(pathQuery, Lucene::BooleanClause::SHOULD); + hasValid = true; + } + } + + return hasValid ? 
boolQuery : nullptr; +} + } // namespace LuceneQueryUtils DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h index 1c5ead25..29c96b5a 100644 --- a/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h +++ b/src/dfm-search/dfm-search-lib/utils/lucenequeryutils.h @@ -25,6 +25,21 @@ namespace LuceneQueryUtils { */ Lucene::String processQueryString(const QString &str, bool caseSensitive = false); +/** + * @brief Build a query that matches text indexed by NGramAnalyzer(1, 2) + * + * The query is built directly instead of passing user input through an n-gram + * analyzer at search time. One- and two-character keywords use TermQuery. + * Longer keywords use a sparse PhraseQuery over 2-grams to avoid generating + * every overlapping query term. + * + * @param fieldName The indexed field name + * @param keyword The raw user keyword + * @param caseSensitive Whether the search is case sensitive + * @return Lucene query object, or nullptr if fieldName or keyword is empty + */ +Lucene::QueryPtr buildNGramSearchQuery(const QString &fieldName, const QString &keyword, bool caseSensitive = false); + /** * @brief Get a list of Lucene special characters that need escaping * @return List of special characters @@ -39,6 +54,18 @@ std::wstring getLuceneSpecialChars(); */ Lucene::QueryPtr buildPathPrefixQuery(const QString &pathPrefix, const QString &fieldName); +/** + * @brief Build a multi-path prefix query for Lucene + * + * When multiple paths are provided, builds a BooleanQuery with SHOULD clauses + * for each path. When only one path is provided, returns a simple pathPrefixQuery. 
+ * + * @param paths List of path prefixes to search for + * @param fieldName The index field name (e.g., "ancestor_paths") + * @return Lucene query object, or nullptr if paths is empty + */ +Lucene::QueryPtr buildMultiPathPrefixQuery(const QStringList &paths, const QString &fieldName); + } // namespace LuceneQueryUtils DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/searchutility.cpp b/src/dfm-search/dfm-search-lib/utils/searchutility.cpp index 19303e47..6327f4fb 100644 --- a/src/dfm-search/dfm-search-lib/utils/searchutility.cpp +++ b/src/dfm-search/dfm-search-lib/utils/searchutility.cpp @@ -24,13 +24,6 @@ using namespace Lucene; namespace Global { -// Index version threshold constants -namespace IndexVersionThresholds { -constexpr int FILENAME_ANCESTOR_PATHS = 3; -constexpr int CONTENT_ANCESTOR_PATHS = 1; -constexpr int OCRTEXT_ANCESTOR_PATHS = 1; -} - /** * @brief Read index version from a JSON status file * @param indexDir The index directory path @@ -831,21 +824,6 @@ int ocrTextIndexVersion() namespace SearchUtility { -bool isFilenameIndexAncestorPathsSupported() -{ - return Global::fileNameIndexVersion() > Global::IndexVersionThresholds::FILENAME_ANCESTOR_PATHS; -} - -bool isContentIndexAncestorPathsSupported() -{ - return Global::contentIndexVersion() > Global::IndexVersionThresholds::CONTENT_ANCESTOR_PATHS; -} - -bool isOcrTextIndexAncestorPathsSupported() -{ - return Global::ocrTextIndexVersion() > Global::IndexVersionThresholds::OCRTEXT_ANCESTOR_PATHS; -} - QStringList extractBooleanKeywords(const SearchQuery &query) { QStringList keywords; @@ -876,27 +854,5 @@ QStringList deepinAnythingFileTypes() return kTypes; } -bool shouldUsePathPrefixQuery(const QString &searchPath) -{ - // Don't use path prefix query for root directory - if (searchPath == "/" || searchPath.isEmpty()) { - return false; - } - - // Check if it's one of the default indexed directories - const QStringList &defaultDirs = Global::defaultIndexedDirectory(); - for (const 
QString &defaultDir : defaultDirs) { - QString normalizedDefault = QDir::cleanPath(defaultDir); - QString normalizedSearch = QDir::cleanPath(searchPath); - - // Don't use path prefix query if search path is one of the default indexed directories - if (normalizedSearch == normalizedDefault) { - return false; - } - } - - return true; -} - } // namespace SearchUtility DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/searchutility.h b/src/dfm-search/dfm-search-lib/utils/searchutility.h index 1fa74fd8..2707b288 100644 --- a/src/dfm-search/dfm-search-lib/utils/searchutility.h +++ b/src/dfm-search/dfm-search-lib/utils/searchutility.h @@ -33,34 +33,6 @@ QStringList extractBooleanKeywords(const SearchQuery &query); */ QStringList deepinAnythingFileTypes(); -/** - * @brief Check if path prefix query optimization should be used - * @param searchPath The search path - * @return true if path prefix query should be used, false otherwise - */ -bool shouldUsePathPrefixQuery(const QString &searchPath); - -/** - * @brief Check if the filename index supports the ancestor_paths field. - * This function checks the filename index version and returns true if the version is greater than 3. - * @return true if the filename index supports ancestor_paths, false otherwise. - */ -bool isFilenameIndexAncestorPathsSupported(); - -/** - * @brief Check if the content index supports the ancestor_paths field. - * This function checks the content index version and returns true if the version is greater than 1. - * @return true if the content index supports ancestor_paths, false otherwise. - */ -bool isContentIndexAncestorPathsSupported(); - -/** - * @brief Check if the OCR text index supports the ancestor_paths field. - * This function checks the OCR text index version and returns true if the version supports ancestor_paths. - * @return true if the OCR text index supports ancestor_paths, false otherwise. 
- */ -bool isOcrTextIndexAncestorPathsSupported(); - } // namespace SearchUtility DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp b/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp index 3fddbf35..d13e5a95 100644 --- a/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp +++ b/src/dfm-search/dfm-search-lib/utils/timerangeutils.cpp @@ -5,6 +5,7 @@ #include #include +#include DFM_SEARCH_BEGIN_NS @@ -38,6 +39,51 @@ Lucene::QueryPtr buildNumericRangeQuery( includeUpper); } +Lucene::QueryPtr buildTimeRangeFilterQuery( + const TimeRangeFilter &filter, + const wchar_t *birthTimeField, + const wchar_t *modifyTimeField) +{ + if (!filter.isValid()) { + return nullptr; + } + + auto [start, end] = filter.resolveTimeRange(); + qint64 startEpoch = toEpochSecs(start); + qint64 endEpoch = toEpochSecs(end); + + if (filter.timeField() == TimeField::Both) { + // Build BooleanQuery with SHOULD for both time fields + Lucene::BooleanQueryPtr timeBoolQuery = Lucene::newLucene(); + + Lucene::QueryPtr birthQuery = buildNumericRangeQuery( + birthTimeField, startEpoch, endEpoch, + filter.includeLower(), filter.includeUpper()); + if (birthQuery) { + timeBoolQuery->add(birthQuery, Lucene::BooleanClause::SHOULD); + } + + Lucene::QueryPtr modifyQuery = buildNumericRangeQuery( + modifyTimeField, startEpoch, endEpoch, + filter.includeLower(), filter.includeUpper()); + if (modifyQuery) { + timeBoolQuery->add(modifyQuery, Lucene::BooleanClause::SHOULD); + } + + // Need at least one clause for a valid BooleanQuery + return (birthQuery || modifyQuery) ? timeBoolQuery : nullptr; + } + + // Single field query + const wchar_t *fieldName = (filter.timeField() == TimeField::BirthTime) + ? 
birthTimeField + : modifyTimeField; + + return buildNumericRangeQuery( + fieldName, startEpoch, endEpoch, + filter.includeLower(), filter.includeUpper()); +} + } // namespace TimeRangeUtils DFM_SEARCH_END_NS diff --git a/src/dfm-search/dfm-search-lib/utils/timerangeutils.h b/src/dfm-search/dfm-search-lib/utils/timerangeutils.h index 72e9196c..6db71022 100644 --- a/src/dfm-search/dfm-search-lib/utils/timerangeutils.h +++ b/src/dfm-search/dfm-search-lib/utils/timerangeutils.h @@ -12,6 +12,8 @@ DFM_SEARCH_BEGIN_NS +class TimeRangeFilter; + /** * @brief TimeRangeUtils provides utility functions for time range operations */ @@ -40,6 +42,25 @@ Lucene::QueryPtr buildNumericRangeQuery( bool includeLower, bool includeUpper); +/** + * @brief Build a Lucene query for time range filtering, supporting TimeField::Both + * + * When filter.timeField() is TimeField::Both, this builds a BooleanQuery with SHOULD + * clauses for both birth_time and modify_time fields. Otherwise, it builds a single + * NumericRangeQuery for the specified field. + * + * The returned query is designed to be added to an outer query with BooleanClause::MUST. + * + * @param filter The time range filter containing field selection and range + * @param birthTimeField The Lucene field name for birth time + * @param modifyTimeField The Lucene field name for modification time + * @return A Lucene query (single NumericRangeQuery or BooleanQuery for Both), or nullptr if invalid + */ +Lucene::QueryPtr buildTimeRangeFilterQuery( + const TimeRangeFilter &filter, + const wchar_t *birthTimeField, + const wchar_t *modifyTimeField); + } // namespace TimeRangeUtils DFM_SEARCH_END_NS