diff --git a/AUTHORS b/AUTHORS index fddc2d9..3652d69 100644 --- a/AUTHORS +++ b/AUTHORS @@ -25,6 +25,7 @@ The following submitted code, packages or analysis, and deserve special thanks: Tobias Predel Andrew Poelstra thaafox + Johan Sarge Thanks to the following, who submitted detailed bug reports and excellent suggestions: diff --git a/ChangeLog b/ChangeLog index 87f628d..0a2e05f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,6 @@ master/HEAD +- #117 Extended Unicode support in Composite, character display width now taken into account + (thanks to Johan Sarge) - #111 Duration: support negative durations by prefixing a '-' before the P in ISO format (thanks to Andrew Poelstra) - #113 Set CMAKE_CURRENT_SOURCE_DIR instead of CMAKE_SOURCE_DIR diff --git a/src/Composite.cpp b/src/Composite.cpp index 0a3d1b5..d8bb5f8 100644 --- a/src/Composite.cpp +++ b/src/Composite.cpp @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright 2016 - 2021, 2023, Gothenburg Bit Factory. +// Copyright 2016 - 2021, 2023, 2026 Gothenburg Bit Factory. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -25,10 +25,111 @@ //////////////////////////////////////////////////////////////////////////////// #include +#include +#include #include -#include #include + +//////////////////////////////////////////////////////////////////////////////// + +namespace +{ + + // Helper function that either replaces a pre-existing element at index (i) in + // a std::vector with the value (x) (if (i) is less than the size of the vector) + // or extends the vector in such a way that it ends up with (i+1) elements, with + // the value (x) at index (i) and the padding value (pad) at each index between + // that of the final pre-existing element of the vector and (i). 
+ template + void put_or_extend ( + std::vector& v, typename std::vector::size_type i, const T& x, const T& pad = T {}) + { + if (i < v.size ()) + v[i] = x; + else + { + v.resize (i, pad); + v.push_back (x); + } + } + + // Helper class that is used to store information about columns in a Composite. + struct ColumnData + { + // Number of topmost layer that overlaps with the column represented by this ColumnData. + // NOTE: Layer numbers start at 1. "Layer 0" is background not covered by any layer. + unsigned int layer_num; + + // Byte offset into the UTF-8 text string of the layer identified by (layer_num). + // Points to the first byte of the first character to include in the content + // of the column represented by this ColumnData. + std::string::size_type text_begin_i; + + // Byte offset into the UTF-8 text string of the layer identified by (layer_num). + // Points to the first byte after the last character to include in the content + // of the column represented by this ColumnData. + std::string::size_type text_end_i; + + // Unicode display width of the first character to include in the content + // of the column represented by this ColumnData. Should always be 1 or 2, + // unless this ColumnData represents a padding column. + unsigned char char_0_width; + + ColumnData ( + unsigned int layer = 0, std::string::size_type begin_i = 1, std::string::size_type end_i = 0, + unsigned char c_0_w = 0) + : + layer_num (layer), text_begin_i (begin_i), text_end_i (end_i), char_0_width (c_0_w) + {} + + ColumnData (const ColumnData& orig) = default; + + ColumnData& operator= (const ColumnData& orig) = default; + + std::string::difference_type byte_count () const + { + return text_end_i - text_begin_i; + } + + // Changes the state of this ColumnData to one that indicates that the ColumnData + // represents a padding column (i.e. a state where byte_count is negative). 
+ void make_padding () + { + text_begin_i = 1; + text_end_i = 0; + char_0_width = 0; + } + + bool is_padding () const + { + return byte_count () < 0; + } + }; + + const ColumnData LAYER_0_PAD; // ColumnData representing a padding column on "layer 0". + + // Special column index value, distinct from any valid column index. + const std::string::size_type INVALID_COLUMN_I = std::numeric_limits::max (); + + // Helper function that turns the uncovered half of half-covered wide characters into padding. + inline void do_halfcovered_wide_char_check ( + std::vector& columns, std::vector::size_type column_i) + { + // If there is a wide character (on a lower layer) in the preceding column, replace + // that character (and any nonspacing characters associated with it) with padding. + // (Because the second half of that character will be covered, and we couldn't display + // half a character if we wanted to.) + if (column_i >= 1 && column_i - 1 < columns.size ()) + { + ColumnData& prev_col_data = columns[column_i - 1]; + if (prev_col_data.char_0_width == 2) + prev_col_data.make_padding (); + } + } + +}; + //////////////////////////////////////////////////////////////////////////////// // Initially assume no text, but infinite virtual space. // @@ -74,65 +175,114 @@ void Composite::add ( // bbbbb // Layer 2 // c // Layer 3 // -// Walk all strings left to right, selecting the character and color from the +// Walk all layers left to right, selecting the character and color from the // highest numbered layer. Emit color codes only on edge detection. // std::string Composite::str () const { - // The strings are broken into a vector of int, for UTF8 support. 
- std::vector characters; - std::vector colors; - for (unsigned int layer = 0; layer < _layers.size (); ++layer) + std::vector columns; + + for (unsigned int layer_i = 0; layer_i < _layers.size (); ++layer_i) { - const auto& text = std::get <0> (_layers[layer]); - auto offset = std::get <1> (_layers[layer]); - auto len = utf8_text_length (text); + const auto& text = std::get <0> (_layers[layer_i]); + auto offset = std::get <1> (_layers[layer_i]); + auto len = utf8_text_length (text); - // Make sure the vectors are large enough to support a write operator[]. - if (characters.size () < offset + len) - { - characters.resize (offset + len, 32); - colors.resize (offset + len, 0); - } + // Make sure the capacity of the column vector is large enough to support push_back() + // without reallocation. + if (columns.capacity () < offset + len) + columns.reserve (offset + len); - // Copy in the layer characters and color indexes. + // Inspect and decide how to handle each character (i.e. Unicode code point) + // in the current layer's text string. + std::string::size_type prev_cursor = 0; std::string::size_type cursor = 0; - int character; - int count = 0; + unsigned int column_count = 0; + std::string::size_type prev_spacer_column_i = INVALID_COLUMN_I; + unsigned int character; while ((character = utf8_next_char (text, cursor))) { - characters[offset + count] = character; - colors [offset + count] = layer + 1; - ++count; + std::string::size_type column_i = offset + column_count; + int ch_width = mk_wcwidth ((wchar_t)character); + + switch (ch_width) + { + case 0: // zero-width / nonspacing character + if (prev_spacer_column_i == INVALID_COLUMN_I) // No preceding spacing character on this layer. + ; // Skip this character. + else // There is a preceding spacing character on this layer. + { + // Append the nonspacing character to the column of the previous spacing character. 
+ columns[prev_spacer_column_i].text_end_i = cursor; + } + break; + case 1: // ordinary narrow spacing character + if (prev_spacer_column_i == INVALID_COLUMN_I) + do_halfcovered_wide_char_check (columns, column_i); + + // Put the character in the appropriate column. Pad out the column list as necessary. + put_or_extend (columns, column_i, ColumnData (layer_i + 1, prev_cursor, cursor, 1), LAYER_0_PAD); + + prev_spacer_column_i = column_i; + column_count += 1; + break; + case 2: // graphically wide spacing character + if (prev_spacer_column_i == INVALID_COLUMN_I) + do_halfcovered_wide_char_check (columns, column_i); + + // Put the character in the appropriate column. Pad out the column list as necessary. + // Make the column after the current one (which is also covered by the wide character) + // a padding column on the current layer. + put_or_extend (columns, column_i, ColumnData (layer_i + 1, prev_cursor, cursor, 2), LAYER_0_PAD); + put_or_extend (columns, column_i + 1, ColumnData (layer_i + 1), LAYER_0_PAD); + + prev_spacer_column_i = column_i; + column_count += 2; + break; + default: // Should not happen. + throw format ("Unexpected character width {1} of code point 0x{2}.", ch_width, formatHex (character)); + } + + // Remember byte offset of first UTF-8 byte of next character in the layer text. + prev_cursor = cursor; } } - // Now walk the character and color vector, emitting every character and - // every detected color change. + // Now walk the column vector, emitting every character and every detected layer change. std::stringstream out; - int prev_color = 0; - for (unsigned int i = 0; i < characters.size (); ++i) + unsigned int prev_layer = 0; + for (unsigned int column_i = 0; column_i < columns.size (); ++column_i) { - // A change in color triggers a code emit. 
- if (prev_color != colors[i]) + const auto& column_data = columns[column_i]; + auto curr_layer = column_data.layer_num; + // NOTE: "Layer 0" padding columns have no entry in _layers, so the layer text is only looked up for non-padding columns below. + + // A change in layer triggers an ANSI escape code emit. + if (prev_layer != curr_layer) { - if (prev_color) - out << std::get <2> (_layers[prev_color - 1]).end (); + if (prev_layer) // Reset attributes (if any) of previous layer. + out << std::get <2> (_layers[prev_layer - 1]).end (); - if (colors[i]) - out << std::get <2> (_layers[colors[i] - 1]).code (); - else - out << std::get <2> (_layers[prev_color - 1]).end (); + if (curr_layer) // Set attributes (if any) of current layer. + out << std::get <2> (_layers[curr_layer - 1]).code (); - prev_color = colors[i]; + prev_layer = curr_layer; } - out << utf8_character (characters[i]); + // The layer text string is already UTF-8, so we can output its bytes verbatim, + // provided that we're keeping track of character (i.e. code point) boundaries. + if (column_data.is_padding ()) + out << ' '; // Display padding columns as spaces. + else // Display a slice of the layer text (Spacer [Nonspacer ...]); non-padding columns always have layer_num >= 1. + out.write (std::get <0> (_layers[curr_layer - 1]).data () + column_data.text_begin_i, column_data.byte_count ()); + + if (column_data.char_0_width == 2) + ++column_i; // Wide characters cover two columns. } // Terminate the color codes, if necessary. 
- if (prev_color) - out << std::get <2> (_layers[prev_color - 1]).end (); + if (prev_layer) + out << std::get <2> (_layers[prev_layer - 1]).end (); return out.str (); } diff --git a/test/composite.t.cpp b/test/composite.t.cpp index 1e9d815..4ed3bca 100644 --- a/test/composite.t.cpp +++ b/test/composite.t.cpp @@ -30,7 +30,7 @@ //////////////////////////////////////////////////////////////////////////////// int main (int, char**) { - UnitTest t (3); + UnitTest t (4); Composite c1; c1.add ("left", 2, Color ()); @@ -130,8 +130,52 @@ int main (int, char**) c8.add ( "foo", 7, Color ("white on red")); t.diag (c8.str ()); + // Add layers containing characters with non-standard Unicode width. + // Verify that they are composited correctly. + // * Each zero-width character should be included in the column of the + // preceding non-zero-width character on the same layer. (If there is + // no such character, the zero-width character should be skipped.) + // * Each wide character should be treated as occupying two columns of the + // layer, the one corresponding to the array index at which the character + // code is stored, and the next one. + // * If exactly one of the columns occupied by a wide character is also + // occupied by a character in a higher layer (obscuring half of the wide + // character), then the wide character should not be displayed at all. + // The unobscured column should be treated as containing blank space + // (but still be covered by the current layer). 
+ Composite c9; + c9.add ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, Color ()); // BG + c9.add ("a", 50, Color ()); // more BG + c9.add ("😃😃😃", 1, Color ()); // some wide chars + c9.add ("bb", 1, Color ()); // obscure the first of the two wide chars + c9.add ("😖😖😖", 8, Color ()); // a few more wide chars + c9.add ("cc", 9, Color ()); // obscure half of each of the first two + c9.add ("😬😬😬", 15, Color ()); // even more + c9.add ("会会会", 18, Color ()); // obscure the last one-and-half + c9.add ("[èé][ñn̄][öô]", 25, Color ()); // layer with zero-width chars (combining diacritics) + c9.add ("}{", 32, Color ()); // obscure two of the non-zero-width chars + c9.add ("è🐋é🐋", 38, Color ()); // 1-col, 0-col and 2-col chars on same layer + c9.add ("\a\aff", 45, Color ()); // zero-width characters at beginning of layer + t.is (c9.str (), "abb😃😃a cc 😖a😬 会会会a[èé][ñn̄}{öô]aè🐋é🐋affa a", "Composite ... --> 'abb😃😃a cc 😖a😬 会会会a[èé][ñn̄}{öô]aè🐋é🐋affa a'"); + + // Add colored layers containing characters with non-standard Unicode width. + // Display the result. 
+ Composite c10; + c10.add ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, Color ("black on bright blue")); // BG + c10.add ("a", 50, Color ("black on bright blue")); // more BG + c10.add ("😃😃😃", 1, Color ("yellow on grey10")); // some wide chars + c10.add ("bb", 1, Color ("red on black")); // obscure the first of the two wide chars + c10.add ("😖😖😖", 8, Color ("green on blue")); // a few more wide chars + c10.add ("cc", 9, Color ("grey18 on green")); // obscure half of each of the first two + c10.add ("😬😬😬", 15, Color ("white on red")); // even more + c10.add ("会会会", 18, Color ("magenta on grey6")); // obscure the last one-and-half + c10.add ("[èé][ñn̄][öô]", 25, Color ("blue on white")); // layer with zero-width chars (combining diacritics) + c10.add ("}{", 32, Color ("red on white")); // obscure two of the non-zero-width chars + c10.add ("è🐋é🐋", 38, Color ("yellow on cyan")); // 1-col, 0-col and 2-col chars on same layer + c10.add ("\a\aff", 45, Color ("black on bright yellow")); // zero-width characters at beginning of layer + t.diag (c10.str ()); + return 0; } //////////////////////////////////////////////////////////////////////////////// -