diff --git a/CMakeLists.txt b/CMakeLists.txt index 1214d2af7..5572f2bf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,7 +143,11 @@ set(ODR_SOURCE_FILES "src/odr/internal/odf/odf_parser.cpp" "src/odr/internal/odf/odf_style.cpp" - "src/odr/internal/oldms/word/io.cpp" + "src/odr/internal/oldms/text/doc_document.cpp" + "src/odr/internal/oldms/text/doc_element_registry.cpp" + "src/odr/internal/oldms/text/doc_helper.cpp" + "src/odr/internal/oldms/text/doc_io.cpp" + "src/odr/internal/oldms/text/doc_parser.cpp" "src/odr/internal/oldms/oldms_file.cpp" "src/odr/internal/ooxml/presentation/ooxml_presentation_document.cpp" diff --git a/src/odr/internal/odf/odf_element_registry.cpp b/src/odr/internal/odf/odf_element_registry.cpp index cb6f01e3c..2429ea056 100644 --- a/src/odr/internal/odf/odf_element_registry.cpp +++ b/src/odr/internal/odf/odf_element_registry.cpp @@ -8,9 +8,10 @@ namespace odr::internal::odf { void ElementRegistry::clear() noexcept { m_elements.clear(); - m_tables.clear(); m_texts.clear(); + m_tables.clear(); m_sheets.clear(); + m_sheet_cells.clear(); } [[nodiscard]] std::size_t ElementRegistry::size() const noexcept { @@ -219,17 +220,17 @@ void ElementRegistry::check_element_id(const ElementIdentifier id) const { } } -void ElementRegistry::check_table_id(const ElementIdentifier id) const { +void ElementRegistry::check_text_id(const ElementIdentifier id) const { check_element_id(id); - if (!m_tables.contains(id)) { + if (!m_texts.contains(id)) { throw std::out_of_range( "DocumentElementRegistry::check_id: identifier not found"); } } -void ElementRegistry::check_text_id(const ElementIdentifier id) const { +void ElementRegistry::check_table_id(const ElementIdentifier id) const { check_element_id(id); - if (!m_texts.contains(id)) { + if (!m_tables.contains(id)) { throw std::out_of_range( "DocumentElementRegistry::check_id: identifier not found"); } diff --git a/src/odr/internal/odf/odf_element_registry.hpp 
b/src/odr/internal/odf/odf_element_registry.hpp index 60bbf466a..ceea7ea38 100644 --- a/src/odr/internal/odf/odf_element_registry.hpp +++ b/src/odr/internal/odf/odf_element_registry.hpp @@ -125,8 +125,8 @@ class ElementRegistry final { std::unordered_map m_sheet_cells; void check_element_id(ElementIdentifier id) const; - void check_table_id(ElementIdentifier id) const; void check_text_id(ElementIdentifier id) const; + void check_table_id(ElementIdentifier id) const; void check_sheet_id(ElementIdentifier id) const; void check_sheet_cell_id(ElementIdentifier id) const; }; diff --git a/src/odr/internal/oldms/text/doc_document.cpp b/src/odr/internal/oldms/text/doc_document.cpp new file mode 100644 index 000000000..9e73e545f --- /dev/null +++ b/src/odr/internal/oldms/text/doc_document.cpp @@ -0,0 +1,199 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace odr::internal::oldms::text { + +namespace { +std::unique_ptr +create_element_adapter(const Document &document, ElementRegistry ®istry); +} + +Document::Document(std::shared_ptr files) + : internal::Document(FileType::legacy_word_document, DocumentType::text, + std::move(files)) { + m_root_element = parse_tree(m_element_registry, *files); + + m_element_adapter = create_element_adapter(*this, m_element_registry); +} + +ElementRegistry &Document::element_registry() { return m_element_registry; } + +const ElementRegistry &Document::element_registry() const { + return m_element_registry; +} + +bool Document::is_editable() const noexcept { return true; } + +bool Document::is_savable(const bool encrypted) const noexcept { + return !encrypted; +} + +void Document::save(const Path &path) const { + (void)path; + throw UnsupportedOperation(); +} + +void Document::save(const Path &path, const char *password) const { + (void)path; + (void)password; + throw UnsupportedOperation(); +} + +namespace { + +class ElementAdapter final : public abstract::ElementAdapter, + public 
abstract::TextRootAdapter, + public abstract::LineBreakAdapter, + public abstract::ParagraphAdapter, + public abstract::SpanAdapter, + public abstract::TextAdapter { +public: + ElementAdapter(const Document &document, ElementRegistry ®istry) + : m_document(&document), m_registry(®istry) {} + + [[nodiscard]] ElementType + element_type(const ElementIdentifier element_id) const override { + return m_registry->element_at(element_id).type; + } + + [[nodiscard]] ElementIdentifier + element_parent(const ElementIdentifier element_id) const override { + return m_registry->element_at(element_id).parent_id; + } + [[nodiscard]] ElementIdentifier + element_first_child(const ElementIdentifier element_id) const override { + return m_registry->element_at(element_id).first_child_id; + } + [[nodiscard]] ElementIdentifier + element_last_child(const ElementIdentifier element_id) const override { + return m_registry->element_at(element_id).last_child_id; + } + [[nodiscard]] ElementIdentifier + element_previous_sibling(const ElementIdentifier element_id) const override { + return m_registry->element_at(element_id).previous_sibling_id; + } + [[nodiscard]] ElementIdentifier + element_next_sibling(const ElementIdentifier element_id) const override { + return m_registry->element_at(element_id).next_sibling_id; + } + + [[nodiscard]] bool element_is_unique( + [[maybe_unused]] const ElementIdentifier element_id) const override { + (void)element_id; + return true; + } + [[nodiscard]] bool element_is_self_locatable( + [[maybe_unused]] const ElementIdentifier element_id) const override { + (void)element_id; + return true; + } + [[nodiscard]] bool element_is_editable( + [[maybe_unused]] const ElementIdentifier element_id) const override { + (void)element_id; + return false; + } + [[nodiscard]] + DocumentPath + element_document_path(const ElementIdentifier element_id) const override { + return util::document::extract_path(*this, element_id, null_element_id); + } + [[nodiscard]] ElementIdentifier + 
element_navigate_path(const ElementIdentifier element_id, + const DocumentPath &path) const override { + return util::document::navigate_path(*this, element_id, path); + } + + [[nodiscard]] const TextRootAdapter * + text_root_adapter(const ElementIdentifier element_id) const override { + return element_type(element_id) == ElementType::root ? this : nullptr; + } + [[nodiscard]] const LineBreakAdapter * + line_break_adapter(const ElementIdentifier element_id) const override { + return element_type(element_id) == ElementType::line_break ? this : nullptr; + } + [[nodiscard]] const ParagraphAdapter * + paragraph_adapter(const ElementIdentifier element_id) const override { + return element_type(element_id) == ElementType::paragraph ? this : nullptr; + } + [[nodiscard]] const SpanAdapter * + span_adapter(const ElementIdentifier element_id) const override { + return element_type(element_id) == ElementType::span ? this : nullptr; + } + [[nodiscard]] const TextAdapter * + text_adapter(const ElementIdentifier element_id) const override { + return element_type(element_id) == ElementType::text ? 
this : nullptr; + } + + [[nodiscard]] PageLayout text_root_page_layout( + [[maybe_unused]] const ElementIdentifier element_id) const override { + (void)element_id; + return {}; + } + [[nodiscard]] ElementIdentifier text_root_first_master_page( + [[maybe_unused]] const ElementIdentifier element_id) const override { + (void)element_id; + return {}; + } + + [[nodiscard]] TextStyle + line_break_style(const ElementIdentifier element_id) const override { + (void)element_id; + return {}; // TODO + } + + [[nodiscard]] ParagraphStyle + paragraph_style(const ElementIdentifier element_id) const override { + (void)element_id; + return {}; // TODO + } + [[nodiscard]] TextStyle + paragraph_text_style(const ElementIdentifier element_id) const override { + (void)element_id; + return {}; // TODO + } + + [[nodiscard]] TextStyle + span_style(const ElementIdentifier element_id) const override { + (void)element_id; + return {}; // TODO + } + + [[nodiscard]] std::string + text_content(const ElementIdentifier element_id) const override { + return m_registry->text_element_at(element_id).text; + } + void text_set_content(const ElementIdentifier element_id, + const std::string &text) const override { + (void)element_id; + (void)text; + throw UnsupportedOperation(); + } + [[nodiscard]] TextStyle + text_style(const ElementIdentifier element_id) const override { + (void)element_id; + return {}; // TODO + } + +private: + // TODO remove maybe_unused + [[maybe_unused]] + const Document *m_document{nullptr}; + ElementRegistry *m_registry{nullptr}; +}; + +std::unique_ptr +create_element_adapter(const Document &document, ElementRegistry ®istry) { + return std::make_unique(document, registry); +} + +} // namespace + +} // namespace odr::internal::oldms::text diff --git a/src/odr/internal/oldms/text/doc_document.hpp b/src/odr/internal/oldms/text/doc_document.hpp new file mode 100644 index 000000000..c88127066 --- /dev/null +++ b/src/odr/internal/oldms/text/doc_document.hpp @@ -0,0 +1,28 @@ +#pragma 
once + +#include +#include + +#include + +namespace odr::internal::oldms::text { + +class Document final : public internal::Document { +public: + explicit Document(std::shared_ptr files); + + ElementRegistry &element_registry(); + + [[nodiscard]] const ElementRegistry &element_registry() const; + + [[nodiscard]] bool is_editable() const noexcept override; + [[nodiscard]] bool is_savable(bool encrypted) const noexcept override; + + void save(const Path &path) const override; + void save(const Path &path, const char *password) const override; + +private: + ElementRegistry m_element_registry; +}; + +} // namespace odr::internal::oldms::text diff --git a/src/odr/internal/oldms/text/doc_element_registry.cpp b/src/odr/internal/oldms/text/doc_element_registry.cpp new file mode 100644 index 000000000..772fd677f --- /dev/null +++ b/src/odr/internal/oldms/text/doc_element_registry.cpp @@ -0,0 +1,98 @@ +#include + +#include + +namespace odr::internal::oldms::text { + +void ElementRegistry::clear() noexcept { + m_elements.clear(); + m_texts.clear(); +} + +[[nodiscard]] std::size_t ElementRegistry::size() const noexcept { + return m_elements.size(); +} + +std::tuple +ElementRegistry::create_element(const ElementType type) { + Element &element = m_elements.emplace_back(); + ElementIdentifier element_id = m_elements.size(); + element.type = type; + return {element_id, element}; +} + +std::tuple +ElementRegistry::create_text_element() { + const auto &[element_id, element] = create_element(ElementType::text); + auto [it, success] = m_texts.emplace(element_id, Text{}); + return {element_id, element, it->second}; +} + +ElementRegistry::Element & +ElementRegistry::element_at(const ElementIdentifier id) { + check_element_id(id); + return m_elements.at(id - 1); +} + +ElementRegistry::Text & +ElementRegistry::text_element_at(const ElementIdentifier id) { + check_text_id(id); + return m_texts.at(id); +} + +const ElementRegistry::Element & +ElementRegistry::element_at(const 
ElementIdentifier id) const { + check_element_id(id); + return m_elements.at(id - 1); +} + +const ElementRegistry::Text & +ElementRegistry::text_element_at(const ElementIdentifier id) const { + check_text_id(id); + return m_texts.at(id); +} + +void ElementRegistry::append_child(const ElementIdentifier parent_id, + const ElementIdentifier child_id) { + check_element_id(parent_id); + check_element_id(child_id); + if (element_at(child_id).parent_id != null_element_id) { + throw std::invalid_argument( + "DocumentElementRegistry::append_child: child already has a parent"); + } + + const ElementIdentifier previous_sibling_id = + element_at(parent_id).last_child_id; + + element_at(child_id).parent_id = parent_id; + element_at(child_id).previous_sibling_id = previous_sibling_id; + + if (element_at(parent_id).first_child_id == null_element_id) { + element_at(parent_id).first_child_id = child_id; + } else { + element_at(previous_sibling_id).next_sibling_id = child_id; + } + element_at(parent_id).last_child_id = child_id; +} + +void ElementRegistry::check_element_id(const ElementIdentifier id) const { + if (id == null_element_id) { + throw std::out_of_range( + "DocumentElementRegistry::check_id: null identifier"); + } + if (id - 1 >= m_elements.size()) { + throw std::out_of_range( + "DocumentElementRegistry::check_id: identifier out of range"); + } +} + +void ElementRegistry::check_text_id(const ElementIdentifier id) const { + check_element_id(id); + if (!m_texts.contains(id)) { + throw std::out_of_range( + "DocumentElementRegistry::check_id: identifier not found"); + } +} + +} // namespace odr::internal::oldms::text diff --git a/src/odr/internal/oldms/text/doc_element_registry.hpp b/src/odr/internal/oldms/text/doc_element_registry.hpp new file mode 100644 index 000000000..41bcaea4f --- /dev/null +++ b/src/odr/internal/oldms/text/doc_element_registry.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace 
odr::internal::oldms::text { + +class ElementRegistry final { +public: + struct Element final { + ElementIdentifier parent_id{null_element_id}; + ElementIdentifier first_child_id{null_element_id}; + ElementIdentifier last_child_id{null_element_id}; + ElementIdentifier previous_sibling_id{null_element_id}; + ElementIdentifier next_sibling_id{null_element_id}; + ElementType type{ElementType::none}; + }; + + struct Text final { + std::string text; + }; + + void clear() noexcept; + + [[nodiscard]] std::size_t size() const noexcept; + + std::tuple create_element(ElementType type); + std::tuple create_text_element(); + + [[nodiscard]] Element &element_at(ElementIdentifier id); + [[nodiscard]] Text &text_element_at(ElementIdentifier id); + + [[nodiscard]] const Element &element_at(ElementIdentifier id) const; + [[nodiscard]] const Text &text_element_at(ElementIdentifier id) const; + + void append_child(ElementIdentifier parent_id, ElementIdentifier child_id); + +private: + std::vector m_elements; + std::unordered_map m_texts; + + void check_element_id(ElementIdentifier id) const; + void check_text_id(ElementIdentifier id) const; +}; + +} // namespace odr::internal::oldms::text diff --git a/src/odr/internal/oldms/text/doc_helper.cpp b/src/odr/internal/oldms/text/doc_helper.cpp new file mode 100644 index 000000000..0cd437b1b --- /dev/null +++ b/src/odr/internal/oldms/text/doc_helper.cpp @@ -0,0 +1,35 @@ +#include + +#include +#include +#include + +#include + +namespace odr::internal::oldms { + +text::CharacterIndex text::read_character_index(std::istream &in) { + CharacterIndex result; + + read_Clx(in, skip_Prc, [&](std::istream &) { + if (const int c = in.get(); c != 0x2) { + throw std::runtime_error("Unexpected input: " + std::to_string(c)); + } + const std::uint32_t lcb = util::byte_stream::read(in); + std::string plcPcd = util::stream::read(in, lcb); + const PlcPcdMap plc_pcd_map(plcPcd.data(), plcPcd.size()); + + for (std::uint32_t i = 0; i < plc_pcd_map.n(); ++i) { + 
namespace odr::internal::oldms::text {

/// Ordered index of text pieces, mapping character positions (CPs) to the
/// location of their data.
///
/// Every piece covers the half-open CP range `[start_cp, start_cp +
/// length_cp)`. Pieces must be appended in increasing CP order and must not
/// overlap.
class CharacterIndex {
public:
  /// Fully resolved view of one piece, as produced by the iterator.
  struct Entry {
    std::size_t start_cp;
    std::size_t length_cp;
    std::size_t data_offset;
    /// Size of the piece in bytes: one byte per character when compressed,
    /// two otherwise.
    std::size_t data_length;
    bool is_compressed;
  };

private:
  /// Stored form of a piece. `start_cp` is stored explicitly so that a first
  /// piece which does not start at 0, or a gap between pieces, is reported
  /// correctly instead of being inferred from the previous piece.
  struct InternalEntry {
    std::size_t start_cp{};
    std::size_t end_cp{};
    std::size_t data_offset{};
    bool is_compressed{};
  };

public:
  [[nodiscard]] bool empty() const { return m_entries.empty(); }
  [[nodiscard]] std::size_t size() const { return m_entries.size(); }

  /// Exclusive end CP of the last piece, or 0 when the index is empty.
  [[nodiscard]] std::size_t last_cp() const {
    if (m_entries.empty()) {
      return 0;
    }
    return m_entries.back().end_cp;
  }

  /// Appends a piece.
  ///
  /// @throws std::runtime_error when `start_cp` lies before the end of the
  ///         previously appended piece, i.e. the piece is out of order or
  ///         overlaps.
  void append(const std::size_t start_cp, const std::size_t length_cp,
              const std::size_t data_offset, const bool is_compressed) {
    // Validate the start, not the end: an overlapping piece can still end
    // after the previous one.
    if (start_cp < last_cp()) {
      throw std::runtime_error(
          "append must be used in order of increasing start_cp");
    }
    m_entries.push_back(InternalEntry{start_cp, start_cp + length_cp,
                                      data_offset, is_compressed});
  }

  /// Forward iterator over the pieces; dereferencing yields an `Entry` by
  /// value.
  struct Iterator {
    Iterator(const CharacterIndex &parent, const std::size_t index)
        : m_parent(&parent), m_index(index) {}

    Entry operator*() const {
      const InternalEntry &piece = m_parent->m_entries[m_index];
      const std::size_t length_cp = piece.end_cp - piece.start_cp;
      const std::size_t bytes_per_cp = piece.is_compressed ? 1 : 2;

      return Entry{piece.start_cp, length_cp, piece.data_offset,
                   length_cp * bytes_per_cp, piece.is_compressed};
    }

    Iterator &operator++() {
      ++m_index;
      return *this;
    }

    Iterator operator++(int) {
      const Iterator tmp = *this;
      ++*this;
      return tmp;
    }

    bool operator==(const Iterator &other) const {
      return m_index == other.m_index;
    }
    // Spelled out so the iterator also works where `!=` is not synthesized
    // from `==`.
    bool operator!=(const Iterator &other) const { return !(*this == other); }

  private:
    const CharacterIndex *m_parent{nullptr};
    std::size_t m_index{0};
  };

  [[nodiscard]] Iterator begin() const { return Iterator{*this, 0}; }
  [[nodiscard]] Iterator end() const {
    return Iterator{*this, m_entries.size()};
  }

  /// Returns an iterator to the piece containing `cp`, or `end()` when no
  /// piece contains it.
  [[nodiscard]] Iterator find(const std::size_t cp) const {
    // Ranges are half-open, so the wanted piece is the first one whose end
    // lies strictly after `cp`.
    const auto it = std::upper_bound(
        m_entries.begin(), m_entries.end(), cp,
        [](const std::size_t value, const InternalEntry &piece) {
          return value < piece.end_cp;
        });
    // `cp` may fall before the first piece or into a gap between pieces.
    if (it == m_entries.end() || cp < it->start_cp) {
      return end();
    }
    return Iterator{*this, static_cast<std::size_t>(it - m_entries.begin())};
  }

private:
  std::vector<InternalEntry> m_entries;
};

CharacterIndex read_character_index(std::istream &in);

} // namespace odr::internal::oldms::text
odr::internal { +namespace odr::internal::oldms { -void oldms::read(std::istream &in, FibBase &out) { +void text::read(std::istream &in, FibBase &out) { util::byte_stream::read(in, out); } -void oldms::read(std::istream &in, FibRgFcLcb97 &out) { +void text::read(std::istream &in, FibRgFcLcb97 &out) { util::byte_stream::read(in, out); } -void oldms::read(std::istream &in, FibRgFcLcb2000 &out) { +void text::read(std::istream &in, FibRgFcLcb2000 &out) { util::byte_stream::read(in, out); } -void oldms::read(std::istream &in, FibRgFcLcb2002 &out) { +void text::read(std::istream &in, FibRgFcLcb2002 &out) { util::byte_stream::read(in, out); } -void oldms::read(std::istream &in, FibRgFcLcb2003 &out) { +void text::read(std::istream &in, FibRgFcLcb2003 &out) { util::byte_stream::read(in, out); } -void oldms::read(std::istream &in, FibRgFcLcb2007 &out) { +void text::read(std::istream &in, FibRgFcLcb2007 &out) { util::byte_stream::read(in, out); } -std::size_t oldms::determine_size_Fib(std::istream &in) { +std::size_t text::determine_size_Fib(std::istream &in) { std::size_t result = 0; const auto read_uint16_t = [&] { @@ -92,7 +92,7 @@ std::size_t oldms::determine_size_Fib(std::istream &in) { return result; } -void oldms::read(std::istream &in, ParsedFib &out) { +void text::read(std::istream &in, ParsedFib &out) { read(in, out.base); util::byte_stream::read(in, out.csw); @@ -133,7 +133,7 @@ void oldms::read(std::istream &in, ParsedFib &out) { }); } -void oldms::read(std::istream &in, ParsedFibRgCswNew &out) { +void text::read(std::istream &in, ParsedFibRgCswNew &out) { util::byte_stream::read(in, out.nFibNew); switch (out.nFibNew) { @@ -157,8 +157,8 @@ void oldms::read(std::istream &in, ParsedFibRgCswNew &out) { } } -std::unique_ptr -oldms::read_FibRgFcLcb(std::istream &in, const std::uint16_t nFib) { +std::unique_ptr +text::read_FibRgFcLcb(std::istream &in, const std::uint16_t nFib) { return type_dispatch_FibRgFcLcb( nFib, [&in](const T) -> std::unique_ptr { using 
FibRgFcLcbType = T::type; @@ -168,8 +168,8 @@ oldms::read_FibRgFcLcb(std::istream &in, const std::uint16_t nFib) { }); } -void oldms::read_Clx(std::istream &in, const HandlePrc &handle_Prc, - const HandlePcdt &handle_Pcdt) { +void text::read_Clx(std::istream &in, const HandlePrc &handle_Prc, + const HandlePcdt &handle_Pcdt) { while (true) { const int c = in.peek(); if (c == 0x2) { @@ -183,7 +183,7 @@ void oldms::read_Clx(std::istream &in, const HandlePrc &handle_Prc, } } -void oldms::skip_Prc(std::istream &in) { +void text::skip_Prc(std::istream &in) { if (const int c = in.get(); c != 0x1) { throw std::runtime_error("Unexpected input: " + std::to_string(c)); } @@ -192,8 +192,8 @@ void oldms::skip_Prc(std::istream &in) { in.ignore(cbGrpprl); } -std::string oldms::read_string_compressed(std::istream &in, - const std::size_t size) { +std::string text::read_string_compressed(std::istream &in, + const std::size_t size) { static constexpr auto eof = std::istream::traits_type::eof(); std::string result; @@ -219,8 +219,8 @@ std::string oldms::read_string_compressed(std::istream &in, return result; } -std::u16string oldms::read_string_uncompressed(std::istream &in, - const std::size_t size) { +std::u16string text::read_string_uncompressed(std::istream &in, + const std::size_t size) { std::u16string result; result.resize(size); @@ -230,7 +230,7 @@ std::u16string oldms::read_string_uncompressed(std::istream &in, return result; } -std::optional oldms::uncompress_char(const char c) { +std::optional text::uncompress_char(const char c) { switch (c) { case '\x82': return 0x201A; @@ -285,4 +285,4 @@ std::optional oldms::uncompress_char(const char c) { } } -} // namespace odr::internal +} // namespace odr::internal::oldms diff --git a/src/odr/internal/oldms/word/io.hpp b/src/odr/internal/oldms/text/doc_io.hpp similarity index 89% rename from src/odr/internal/oldms/word/io.hpp rename to src/odr/internal/oldms/text/doc_io.hpp index 9677fbe43..6369c5735 100644 --- 
a/src/odr/internal/oldms/word/io.hpp +++ b/src/odr/internal/oldms/text/doc_io.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -8,7 +8,7 @@ #include #include -namespace odr::internal::oldms { +namespace odr::internal::oldms::text { void read(std::istream &in, FibBase &out); void read(std::istream &in, FibRgFcLcb97 &out); @@ -36,4 +36,4 @@ std::u16string read_string_uncompressed(std::istream &in, std::size_t size); std::optional uncompress_char(char c); -} // namespace odr::internal::oldms +} // namespace odr::internal::oldms::text diff --git a/src/odr/internal/oldms/text/doc_parser.cpp b/src/odr/internal/oldms/text/doc_parser.cpp new file mode 100644 index 000000000..f861243fe --- /dev/null +++ b/src/odr/internal/oldms/text/doc_parser.cpp @@ -0,0 +1,53 @@ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace odr::internal::oldms { + +ElementIdentifier text::parse_tree(ElementRegistry ®istry, + const abstract::ReadableFilesystem &files) { + auto [root_id, _] = registry.create_element(ElementType::root); + + const std::string word_document = + util::stream::read(*files.open(AbsPath("/WordDocument"))->stream()); + + const auto stream = files.open(AbsPath("/WordDocument"))->stream(); + ParsedFib fib; + read(*stream, fib); + + const std::string tableStreamPath = + fib.base.fWhichTblStm == 1 ? 
"/1Table" : "/0Table"; + + const std::string table = + util::stream::read(*files.open(AbsPath(tableStreamPath))->stream()); + + const auto table_stream = files.open(AbsPath(tableStreamPath))->stream(); + table_stream->ignore(fib.fibRgFcLcb->clx.fc); + const CharacterIndex character_index = read_character_index(*table_stream); + for (const auto &entry : character_index) { + const auto document_stream = files.open(AbsPath("/WordDocument"))->stream(); + document_stream->seekg(entry.data_offset); + const std::string text = + read_string_compressed(*document_stream, entry.data_length); + + auto [text_id, _, text_element] = registry.create_text_element(); + text_element.text = text; + + registry.append_child(root_id, text_id); + } + + return root_id; +} + +} // namespace odr::internal::oldms diff --git a/src/odr/internal/oldms/text/doc_parser.hpp b/src/odr/internal/oldms/text/doc_parser.hpp new file mode 100644 index 000000000..d510d6b40 --- /dev/null +++ b/src/odr/internal/oldms/text/doc_parser.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include + +namespace odr::internal::abstract { +class ReadableFilesystem; +} + +namespace odr::internal::oldms::text { +class ElementRegistry; + +ElementIdentifier parse_tree(ElementRegistry ®istry, + const abstract::ReadableFilesystem &files); + +} // namespace odr::internal::oldms::text diff --git a/src/odr/internal/oldms/word/structs.hpp b/src/odr/internal/oldms/text/doc_structs.hpp similarity index 99% rename from src/odr/internal/oldms/word/structs.hpp rename to src/odr/internal/oldms/text/doc_structs.hpp index 84e8cbf19..889524d62 100644 --- a/src/odr/internal/oldms/word/structs.hpp +++ b/src/odr/internal/oldms/text/doc_structs.hpp @@ -6,7 +6,7 @@ #include #include -namespace odr::internal::oldms { +namespace odr::internal::oldms::text { #pragma pack(push, 1) @@ -392,4 +392,4 @@ class PlcPcdMap : public PlcPcdBase { std::size_t m_cbPlc{0}; }; -} // namespace odr::internal::oldms +} // namespace odr::internal::oldms::text diff --git 
a/test/src/internal/oldms/oldms_test.cpp b/test/src/internal/oldms/oldms_test.cpp index aeb9bb054..2a01ee6d7 100644 --- a/test/src/internal/oldms/oldms_test.cpp +++ b/test/src/internal/oldms/oldms_test.cpp @@ -7,7 +7,8 @@ #include #include -#include +#include +#include #include #include @@ -31,13 +32,13 @@ TEST(OldMs, test) { internal::util::stream::read(*files.open("/WordDocument").stream()); std::cout << "/WordDocument size " << word_document.size() << std::endl; - const std::size_t fib_size = internal::oldms::determine_size_Fib( + const std::size_t fib_size = internal::oldms::text::determine_size_Fib( *files.open("/WordDocument").stream()); std::cout << "Fib size " << fib_size << std::endl; const auto stream = files.open("/WordDocument").stream(); - internal::oldms::ParsedFib fib; - internal::oldms::read(*stream, fib); + internal::oldms::text::ParsedFib fib; + internal::oldms::text::read(*stream, fib); const std::string tableStreamPath = fib.base.fWhichTblStm == 1 ? "/1Table" : "/0Table"; @@ -51,35 +52,13 @@ TEST(OldMs, test) { std::cout << "Fib.fibRgFcLcb->clx.lcb " << fib.fibRgFcLcb->clx.lcb << std::endl; table_stream->ignore(fib.fibRgFcLcb->clx.fc); - internal::oldms::read_Clx( - *table_stream, internal::oldms::skip_Prc, [&](std::istream &in) { - if (const int c = in.get(); c != 0x2) { - throw std::runtime_error("Unexpected input: " + std::to_string(c)); - } - const std::uint32_t lcb = - internal::util::byte_stream::read(in); - std::cout << "lcb " << lcb << std::endl; - std::string plcPcd = internal::util::stream::read(in, lcb); - const internal::oldms::PlcPcdMap plc_pcd_map(plcPcd.data(), - plcPcd.size()); - std::cout << "plc_pcd_map n " << plc_pcd_map.n() << std::endl; - std::cout << "plc_pcd_map aCP(0) " << plc_pcd_map.aCP(0) << std::endl; - std::cout << "plc_pcd_map aCP(1) " << plc_pcd_map.aCP(1) << std::endl; - std::cout << "plc_pcd_map aData(0).fc.fc " << plc_pcd_map.aData(0).fc.fc - << std::endl; - std::cout << "plc_pcd_map aData(0).fc.fCompressed " - 
<< plc_pcd_map.aData(0).fc.fCompressed << std::endl; - - const std::size_t first_text_offset = plc_pcd_map.aData(0).fc.fc / 2; - const std::size_t first_text_length = - plc_pcd_map.aCP(1) - plc_pcd_map.aCP(0); - std::cout << "first_text_length " << first_text_length << std::endl; - std::cout << "first_text_offset " << first_text_offset << std::endl; - - const auto document_stream = files.open("/WordDocument").stream(); - document_stream->seekg(first_text_offset); - const std::string first_text = internal::oldms::read_string_compressed( - *document_stream, first_text_length); - std::cout << "first_text " << first_text << std::endl; - }); + const internal::oldms::text::CharacterIndex character_index = + internal::oldms::text::read_character_index(*table_stream); + for (const auto &entry : character_index) { + const auto document_stream = files.open("/WordDocument").stream(); + document_stream->seekg(entry.data_offset); + const std::string text = internal::oldms::text::read_string_compressed( + *document_stream, entry.data_length); + std::cout << "text " << text << std::endl; + } }