Skip to content

Commit 78e8680

Browse files
authored
oldms: string reading utility (#503)
1 parent 184754c commit 78e8680

5 files changed

Lines changed: 109 additions & 4 deletions

File tree

src/odr/internal/oldms/word/io.cpp

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#include <odr/internal/oldms/word/io.hpp>
22

3+
#include "odr/internal/util/string_util.hpp"
4+
35
#include <odr/internal/util/byte_stream_util.hpp>
4-
#include <odr/internal/util/stream_util.hpp>
6+
#include <odr/internal/util/string_util.hpp>
57

68
namespace odr::internal::oldms {
79

@@ -190,4 +192,97 @@ void oldms::skip_Prc(std::istream &in) {
190192
in.ignore(cbGrpprl);
191193
}
192194

195+
std::string oldms::read_string_compressed(std::istream &in,
196+
const std::size_t size) {
197+
static constexpr auto eof = std::istream::traits_type::eof();
198+
199+
std::string result;
200+
result.reserve(size);
201+
202+
for (std::size_t i = 0; i < size; ++i) {
203+
const auto ci = in.get();
204+
if (ci == eof) {
205+
throw std::runtime_error("Unexpected end of input");
206+
}
207+
if (ci < 0 || ci > 0xFF) {
208+
throw std::runtime_error("Unexpected input: " + std::to_string(ci));
209+
}
210+
const char c = static_cast<char>(ci);
211+
if (const std::optional<char16_t> uncompressed = uncompress_char(c);
212+
uncompressed.has_value()) {
213+
util::string::append_c32(*uncompressed, result);
214+
} else {
215+
result.push_back(c);
216+
}
217+
}
218+
219+
return result;
220+
}
221+
222+
/// Reads `size` 16-bit code units from `in` and returns them as a
/// `std::u16string`.
///
/// The bytes are copied verbatim into the string's storage, so the code units
/// are interpreted in the byte order of the host.
/// NOTE(review): the file format presumably stores little-endian code units;
/// confirm whether big-endian hosts need a byte swap.
///
/// @param in   stream positioned at the first code unit to read
/// @param size number of `char16_t` code units (not bytes) to read
/// @return the code units read from the stream
/// @throws std::runtime_error if fewer than `size` code units could be read
std::u16string oldms::read_string_uncompressed(std::istream &in,
                                               const std::size_t size) {
  std::u16string result;
  result.resize(size);

  const auto byte_count =
      static_cast<std::streamsize>(size * sizeof(char16_t));
  in.read(reinterpret_cast<char *>(result.data()), byte_count);

  // A short read would otherwise leave zero code units at the end of the
  // result; report it the same way read_string_compressed does.
  if (in.gcount() != byte_count) {
    throw std::runtime_error("Unexpected end of input");
  }

  return result;
}
232+
233+
/// Looks up the Unicode code point for a byte of "compressed" text that does
/// not stand for the code point of the same numeric value.
///
/// @param c the byte as read from the stream
/// @return the replacement code point for the bytes listed in the table
///         below, `std::nullopt` for every other byte
std::optional<char16_t> oldms::uncompress_char(const char c) {
  struct Mapping {
    char compressed;
    char16_t unicode;
  };

  // Keys are written as `char` literals so the comparison with `c` behaves
  // the same whether plain `char` is signed or unsigned.
  static constexpr Mapping mappings[] = {
      {'\x82', 0x201A}, {'\x83', 0x0192}, {'\x84', 0x201E}, {'\x85', 0x2026},
      {'\x86', 0x2020}, {'\x87', 0x2021}, {'\x88', 0x02C6}, {'\x89', 0x2030},
      {'\x8A', 0x0160}, {'\x8B', 0x2039}, {'\x8C', 0x0152}, {'\x91', 0x2018},
      {'\x92', 0x2019}, {'\x93', 0x201C}, {'\x94', 0x201D}, {'\x95', 0x2022},
      {'\x96', 0x2013}, {'\x97', 0x2014}, {'\x98', 0x02DC}, {'\x99', 0x2122},
      {'\x9A', 0x0161}, {'\x9B', 0x203A}, {'\x9C', 0x0153}, {'\x9F', 0x0178},
  };

  for (const Mapping &mapping : mappings) {
    if (mapping.compressed == c) {
      return mapping.unicode;
    }
  }
  return std::nullopt;
}
287+
193288
} // namespace odr::internal

src/odr/internal/oldms/word/io.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <functional>
77
#include <iosfwd>
88
#include <memory>
9+
#include <optional>
910

1011
namespace odr::internal::oldms {
1112

@@ -30,4 +31,9 @@ void read_Clx(std::istream &in, const HandlePrc &handle_Prc,
3031
const HandlePcdt &handle_Pcdt);
3132
void skip_Prc(std::istream &in);
3233

34+
std::string read_string_compressed(std::istream &in, std::size_t size);
35+
std::u16string read_string_uncompressed(std::istream &in, std::size_t size);
36+
37+
std::optional<char16_t> uncompress_char(char c);
38+
3339
} // namespace odr::internal::oldms

src/odr/internal/util/string_util.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include <algorithm>
44
#include <cstdint>
55
#include <iomanip>
6-
#include <locale>
76
#include <sstream>
87

98
#include <utf8cpp/utf8/cpp17.h>
@@ -85,4 +84,8 @@ std::string string::c16str_to_string(const char16_t *c16str,
8584
return u16string_to_string(std::u16string(c16str, length / 2));
8685
}
8786

87+
// Appends the code point `c` to `string` by delegating to utf8cpp's
// `utf8::append`, i.e. the UTF-8 encoding of `c` is added at the end of
// `string`. Any error reporting for invalid code points is whatever
// `utf8::append` does; nothing is handled here.
void string::append_c32(const char32_t c, std::string &string) {
88+
utf8::append(c, string);
89+
}
90+
8891
} // namespace odr::internal::util

src/odr/internal/util/string_util.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,6 @@ std::string to_string(double d, int precision);
2626
std::string u16string_to_string(const std::u16string &string);
2727
std::u16string string_to_u16string(const std::string &string);
2828
std::string c16str_to_string(const char16_t *c16str, std::size_t length);
29+
void append_c32(char32_t c, std::string &string);
2930

3031
} // namespace odr::internal::util::string

test/src/internal/oldms/oldms_test.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ TEST(OldMs, test) {
7878

7979
const auto document_stream = files.open("/WordDocument").stream();
8080
document_stream->seekg(first_text_offset);
81-
const std::string first_text =
82-
internal::util::stream::read(*document_stream, first_text_length);
81+
const std::string first_text = internal::oldms::read_string_compressed(
82+
*document_stream, first_text_length);
8383
std::cout << "first_text " << first_text << std::endl;
8484
});
8585
}

0 commit comments

Comments
 (0)