|
| 1 | +// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT |
| 2 | +// file at the top-level directory of this distribution and at |
| 3 | +// http://rust-lang.org/COPYRIGHT. |
| 4 | +// |
| 5 | +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 7 | +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 8 | +// option. This file may not be copied, modified, or distributed |
| 9 | +// except according to those terms. |
| 10 | + |
| 11 | +// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly |
| 12 | +use crate::tables::*; |
| 13 | +use crate::width_info::WidthInfo; |
| 14 | + |
| 15 | +/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by |
| 16 | +/// consulting a multi-level lookup table. |
| 17 | +/// |
| 18 | +/// # Maintenance |
| 19 | +/// The tables themselves are autogenerated but this function is hardcoded. You should have |
| 20 | +/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) |
| 21 | +/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the |
| 22 | +/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes. |
| 23 | +#[inline] |
| 24 | +pub(crate) fn lookup_width(c: char) -> (u8, WidthInfo) { |
| 25 | + let cp = c as usize; |
| 26 | + |
| 27 | + let t1_offset = WIDTH_ROOT.0[cp >> 13]; |
| 28 | + |
| 29 | + // Each sub-table in WIDTH_MIDDLE is 7 bits, and each stored entry is a byte, |
| 30 | + // so each sub-table is 128 bytes in size. |
| 31 | + // (Sub-tables are selected using the computed offset from the previous table.) |
| 32 | + let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][cp >> 7 & 0x3F]; |
| 33 | + |
| 34 | + // Each sub-table in WIDTH_LEAVES is 6 bits, but each stored entry is 2 bits. |
| 35 | + // This is accomplished by packing four stored entries into one byte. |
| 36 | + // So each sub-table is 2**(7-2) == 32 bytes in size. |
| 37 | + // Since this is the last table, each entry represents an encoded width. |
| 38 | + let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][cp >> 2 & 0x1F]; |
| 39 | + |
| 40 | + // Extract the packed width |
| 41 | + let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; |
| 42 | + |
| 43 | + if width < 3 { |
| 44 | + (width, WidthInfo::DEFAULT) |
| 45 | + } else { |
| 46 | + match c { |
| 47 | + '\u{A}' => (1, WidthInfo::LINE_FEED), |
| 48 | + '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED), |
| 49 | + '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF), |
| 50 | + '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER), |
| 51 | + '\u{17D8}' => (3, WidthInfo::DEFAULT), |
| 52 | + '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA), |
| 53 | + '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT), |
| 54 | + '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU), |
| 55 | + '\u{FE01}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3), |
| 56 | + '\u{FE0E}' => (0, WidthInfo::VARIATION_SELECTOR_15), |
| 57 | + '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16), |
| 58 | + '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I), |
| 59 | + '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E), |
| 60 | + '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI), |
| 61 | + '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR), |
| 62 | + '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER), |
| 63 | + _ => (2, WidthInfo::EMOJI_PRESENTATION), |
| 64 | + } |
| 65 | + } |
| 66 | +} |
/// Looks up the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
/// walking a three-level lookup table (`WIDTH_ROOT_CJK` -> `WIDTH_MIDDLE` -> `WIDTH_LEAVES`).
///
/// Only compiled when the `cjk` feature is enabled. The walk starts from the
/// `WIDTH_ROOT_CJK` root table; the middle and leaf tables are the same ones named by
/// `lookup_width`.
///
/// The result is the width paired with a `WidthInfo` value. Leaf entries `0`, `1` and `2`
/// are returned unchanged together with `WidthInfo::DEFAULT`. A leaf entry of `3` marks a
/// code point whose result comes from the hardcoded `match` at the end of this function.
///
/// # Maintenance
/// The tables themselves are autogenerated, but this function is hardcoded. Re-running
/// `unicode.py` (for example, when updating Unicode) needs no changes here. However, if
/// you change the *actual structure* of the lookup tables (perhaps by editing the
/// `make_tables` function in `unicode.py`), you must update this code to match.
#[cfg(feature = "cjk")]
#[inline]
pub(crate) fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
    let code_point = c as usize;

    // Level 1: everything above the low 13 bits indexes the CJK root table. The entry
    // found there selects which sub-table of WIDTH_MIDDLE to consult.
    let middle_table = usize::from(WIDTH_ROOT_CJK.0[code_point >> 13]);

    // Level 2: bits 7..=12 (a 6-bit index, hence the 0x3F mask) pick one byte inside the
    // chosen middle sub-table. That byte selects which sub-table of WIDTH_LEAVES to use.
    let middle_slot = (code_point >> 7) & 0x3F;
    let leaf_table = usize::from(WIDTH_MIDDLE.0[middle_table][middle_slot]);

    // Level 3: the low 7 bits address a 2-bit entry. Four entries are packed per byte, so
    // bits 2..=6 (a 5-bit index, hence the 0x1F mask) pick the byte...
    let leaf_slot = (code_point >> 2) & 0x1F;
    let packed = WIDTH_LEAVES.0[leaf_table][leaf_slot];

    // ...and bits 0..=1 pick which of the four 2-bit fields inside that byte to extract.
    let shift = 2 * (code_point & 0b11);
    let width = (packed >> shift) & 0b11;

    // `width` is masked to two bits, so anything other than 3 is a plain width.
    if width != 3 {
        return (width, WidthInfo::DEFAULT);
    }

    // Leaf value 3: the result is decided per code point (or range) below.
    match c {
        '\u{A}' => (1, WidthInfo::LINE_FEED),
        '\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
        '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
        '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
        '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
        '\u{17D8}' => (3, WidthInfo::DEFAULT),
        '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
        '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
        '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
        '\u{FE00}'..='\u{FE02}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
        '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
        '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
        '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
        '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
        '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
        '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
        // Every other code point flagged with 3 falls through to this arm.
        _ => (2, WidthInfo::EMOJI_PRESENTATION),
    }
}
0 commit comments