@@ -8,7 +8,7 @@ use std::collections::{HashMap, VecDeque};
88use std:: hash:: { Hash , Hasher } ;
99use std:: sync:: Arc ;
1010
11- type DFACache = HashMap < String , Arc < DFA > > ;
11+ type DFACache = HashMap < Box < [ u8 ] > , Arc < DFA > > ;
1212type DFA = dense:: DFA < Vec < u32 > > ;
1313
1414/// A DFA along with its state. Generic to facilitate experimentation with
@@ -17,7 +17,7 @@ type DFA = dense::DFA<Vec<u32>>;
1717#[ derive( Clone , Debug ) ]
1818pub struct DFAState {
1919 /// The regex representing this dfa.
20- pub regex : Box < str > ,
20+ pub regex : Box < [ u8 ] > ,
2121 /// The actual DFA implementation from the library.
2222 pub dfa : Arc < DFA > ,
2323 /// The state of this DFA. Defaults to the starting state of the DFA.
@@ -43,12 +43,13 @@ impl DFABuilder {
4343
4444 /// Return a DFAState, either from the cache or building a new one from scratch.
4545 // FIXME: Remove the clones from this function to accelerate it further.
46- pub fn build_dfa ( & mut self , regex : String ) -> DFAState {
47- match self . cache . get ( & regex) {
46+ pub fn build_dfa ( & mut self , regex : & [ u8 ] ) -> DFAState {
47+ match self . cache . get ( regex) {
4848 Some ( dfa) => DFAState :: new ( regex, dfa. clone ( ) ) ,
4949 None => {
50- let new_dfa = Arc :: new ( DFA :: new ( & regex) . unwrap ( ) ) ;
51- self . cache . insert ( regex. clone ( ) , new_dfa. clone ( ) ) ;
50+ let regex_str: & str = & std:: str:: from_utf8 ( regex) . unwrap ( ) ;
51+ let new_dfa = Arc :: new ( DFA :: new ( regex_str) . unwrap ( ) ) ;
52+ self . cache . insert ( regex. into ( ) , new_dfa. clone ( ) ) ;
5253 DFAState :: new ( regex, new_dfa)
5354 }
5455 }
@@ -59,30 +60,30 @@ impl DFABuilder {
5960impl DFAState {
6061 /// For the Python interface, make advance return the whole DFAState rather than the StateID.
6162 /// TODO: This is probably a better way to do it than returning the StateID.
62- #[ pyo3( name= "advance" ) ]
63- pub fn py_advance ( & self , input : String ) -> DFAState {
63+ #[ pyo3( name = "advance" ) ]
64+ pub fn py_advance ( & self , input : & [ u8 ] ) -> DFAState {
6465 let mut dfa = self . clone ( ) ;
65- for c in input. chars ( ) {
66- dfa. consume_character ( c ) ;
66+ for b in input {
67+ dfa. consume ( b ) ;
6768 }
6869 dfa. clone ( )
6970 }
7071
7172 #[ getter( state_id) ]
7273 fn state_id ( & self ) -> u32 {
73- self . state_id . as_u32 ( )
74+ self . state_id . as_u32 ( )
7475 }
7576
7677 #[ getter( regex) ]
77- fn regex ( & self ) -> String {
78- self . regex . to_string ( )
78+ fn regex ( & self ) -> & [ u8 ] {
79+ & self . regex
7980 }
8081}
8182
8283/// A dense implementation of the DFAState abstraction.
8384impl DFAState {
8485 /// Encapsulate the kluge necessary to set up the DFA correctly for Syncode's use case.
85- fn new ( regex : String , dfa : Arc < DFA > ) -> DFAState {
86+ fn new ( regex : & [ u8 ] , dfa : Arc < DFA > ) -> DFAState {
8687 // We always want the DFA to match starting from the beginning of the string.
8788 let config = start:: Config :: new ( ) . anchored ( Anchored :: Yes ) ;
8889 let state_id = dfa. start_state ( & config) . unwrap ( ) ;
@@ -94,36 +95,17 @@ impl DFAState {
9495 }
9596
9697 /// Convenience function to set the state how we want it.
97- pub fn advance ( & mut self , input : String ) -> StateID {
98- for c in input. chars ( ) {
99- self . consume_character ( c ) ;
98+ pub fn advance ( & mut self , input : & [ u8 ] ) -> StateID {
99+ for b in input {
100+ self . consume ( b ) ;
100101 }
101102 self . state_id
102103 }
103104
104- /// Consume a character , starting at the current state, setting and
105+ /// Consume a byte , starting at the current state, setting and
105106 /// returning the new state.
106- ///
107- /// The logic here is non-trivial, because UTF-8 characters are a variable
108- /// number of bytes long, and the underlying DFA has bytes as its input
109- /// alphabet.
110- pub fn consume_character ( & mut self , c : char ) -> StateID {
111- let char_len = c. len_utf8 ( ) ;
112- // Buffer to store character as bytes. UFT-8 characters are at most 4
113- // bytes long, so allocate a buffer big enough to store the whole
114- // character regardless of how long it turns out to be.
115- let mut buf = [ 0 ; 4 ] ;
116- c. encode_utf8 ( & mut buf) ;
117- for ( i, & b) in buf. iter ( ) . enumerate ( ) {
118- // The number of bytes per character is variable: we only need to
119- // feed the number of bytes that the character actually is into the
120- // DFA; any more would be incorrect. Break the loop once we've gone
121- // past the end of the character.
122- if i >= char_len {
123- break ;
124- }
125- self . state_id = self . dfa . next_state ( self . state_id , b) ;
126- }
107+ pub fn consume ( & mut self , b : & u8 ) -> StateID {
108+ self . state_id = self . dfa . next_state ( self . state_id , * b) ;
127109 self . state_id
128110 }
129111
@@ -179,14 +161,14 @@ impl Hash for DFAState {
179161 }
180162}
181163/// Compute the union of all states of a list of regexes.
182- pub fn all_dfa_states ( terminals : & Vec < String > ) -> Vec < DFAState > {
164+ pub fn all_dfa_states ( terminals : & Vec < & [ u8 ] > ) -> Vec < DFAState > {
183165 let mut res = Vec :: new ( ) ;
184166 let mut builder = DFABuilder :: new ( ) ;
185167 for terminal in terminals. iter ( ) {
186- let dfa = builder. build_dfa ( terminal. clone ( ) ) ;
168+ let dfa = builder. build_dfa ( terminal) ;
187169 for state in dfa. states ( ) {
188170 res. push ( DFAState {
189- regex : terminal . to_string ( ) . into ( ) ,
171+ regex : ( * terminal ) . into ( ) ,
190172 dfa : dfa. dfa . clone ( ) ,
191173 state_id : state,
192174 } ) ;
@@ -201,40 +183,40 @@ mod tests {
201183
202184 #[ test]
203185 fn test_consume_character_match ( ) {
204- let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( "a" . to_string ( ) ) ;
205- let mut state = dfa_state. consume_character ( 'a' ) ;
186+ let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( b "a") ;
187+ let mut state = dfa_state. consume ( & b"a" [ 0 ] ) ;
206188 state = dfa_state. dfa . next_eoi_state ( state) ;
207189 assert ! ( dfa_state. dfa. is_match_state( state) ) ;
208190 }
209191
210192 #[ test]
211193 fn test_consume_character_fails_to_match ( ) {
212- let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( "a" . to_string ( ) ) ;
213- let mut state = dfa_state. consume_character ( 'b' ) ;
194+ let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( b "a") ;
195+ let mut state = dfa_state. consume ( & b"b" [ 0 ] ) ;
214196 state = dfa_state. dfa . next_eoi_state ( state) ;
215197 assert ! ( !dfa_state. dfa. is_match_state( state) ) ;
216198 }
217199
218200 #[ test]
219201 fn test_advance_match ( ) {
220- let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( "[ab¥]*" . to_string ( ) ) ;
221- let mut state = dfa_state. advance ( "aabb¥aab" . to_string ( ) ) ;
202+ let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( "[ab¥]*" . as_bytes ( ) ) ;
203+ let mut state = dfa_state. advance ( "aabb¥aab" . as_bytes ( ) ) ;
222204 state = dfa_state. dfa . next_eoi_state ( state) ;
223205 assert ! ( dfa_state. dfa. is_match_state( state) ) ;
224206 }
225207
226208 #[ test]
227209 fn test_advance_fails_to_match ( ) {
228- let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( "[ab]*" . to_string ( ) ) ;
229- let mut state = dfa_state. advance ( "aabba¥ab" . to_string ( ) ) ;
210+ let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( "[ab]*" . as_bytes ( ) ) ;
211+ let mut state = dfa_state. advance ( "aabba¥ab" . as_bytes ( ) ) ;
230212 state = dfa_state. dfa . next_eoi_state ( state) ;
231213 assert ! ( !dfa_state. dfa. is_match_state( state) ) ;
232214 }
233215
234216 #[ test]
235217 fn test_advance ( ) {
236- let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( r"[a-zA-Z_]*" . to_string ( ) ) ;
237- let state = dfa_state. advance ( "indeed" . to_string ( ) ) ;
218+ let mut dfa_state = DFABuilder :: new ( ) . build_dfa ( r"[a-zA-Z_]*" . as_bytes ( ) ) ;
219+ let state = dfa_state. advance ( "indeed" . as_bytes ( ) ) ;
238220 assert ! ( dfa_state. dfa. is_match_state( state) ) ;
239221 }
240222}
0 commit comments