Skip to content

Commit 7e3035f

Browse files
committed
Change from str to bytes.
This should make everything easier.
1 parent 68f28ad commit 7e3035f

File tree

2 files changed

+147
-173
lines changed

2 files changed

+147
-173
lines changed

src/dfa.rs

Lines changed: 34 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::collections::{HashMap, VecDeque};
88
use std::hash::{Hash, Hasher};
99
use std::sync::Arc;
1010

11-
type DFACache = HashMap<String, Arc<DFA>>;
11+
type DFACache = HashMap<Box<[u8]>, Arc<DFA>>;
1212
type DFA = dense::DFA<Vec<u32>>;
1313

1414
/// A DFA along with its state. Generic to facilitate experimentation with
@@ -17,7 +17,7 @@ type DFA = dense::DFA<Vec<u32>>;
1717
#[derive(Clone, Debug)]
1818
pub struct DFAState {
1919
/// The regex representing this dfa.
20-
pub regex: Box<str>,
20+
pub regex: Box<[u8]>,
2121
/// The actual DFA implementation from the library.
2222
pub dfa: Arc<DFA>,
2323
/// The state of this DFA. Defaults to the starting state of the DFA.
@@ -43,12 +43,13 @@ impl DFABuilder {
4343

4444
/// Return a DFAState, either from the cache or building a new one from scratch.
4545
// FIXME: Remove the clones from this function to accelerate it further.
46-
pub fn build_dfa(&mut self, regex: String) -> DFAState {
47-
match self.cache.get(&regex) {
46+
pub fn build_dfa(&mut self, regex: &[u8]) -> DFAState {
47+
match self.cache.get(regex) {
4848
Some(dfa) => DFAState::new(regex, dfa.clone()),
4949
None => {
50-
let new_dfa = Arc::new(DFA::new(&regex).unwrap());
51-
self.cache.insert(regex.clone(), new_dfa.clone());
50+
let regex_str: &str = &std::str::from_utf8(regex).unwrap();
51+
let new_dfa = Arc::new(DFA::new(regex_str).unwrap());
52+
self.cache.insert(regex.into(), new_dfa.clone());
5253
DFAState::new(regex, new_dfa)
5354
}
5455
}
@@ -59,30 +60,30 @@ impl DFABuilder {
5960
impl DFAState {
6061
/// For the Python interface, make advance return the whole DFAState rather than the StateID.
6162
/// TODO: This is probably a better way to do it than returning the StateID.
62-
#[pyo3(name="advance")]
63-
pub fn py_advance(&self, input: String) -> DFAState {
63+
#[pyo3(name = "advance")]
64+
pub fn py_advance(&self, input: &[u8]) -> DFAState {
6465
let mut dfa = self.clone();
65-
for c in input.chars() {
66-
dfa.consume_character(c);
66+
for b in input {
67+
dfa.consume(b);
6768
}
6869
dfa.clone()
6970
}
7071

7172
#[getter(state_id)]
7273
fn state_id(&self) -> u32 {
73-
self.state_id.as_u32()
74+
self.state_id.as_u32()
7475
}
7576

7677
#[getter(regex)]
77-
fn regex(&self) -> String {
78-
self.regex.to_string()
78+
fn regex(&self) -> &[u8] {
79+
&self.regex
7980
}
8081
}
8182

8283
/// A dense implementation of the DFAState abstraction.
8384
impl DFAState {
8485
/// Encapsulate the kluge necessary to set up the DFA correctly for Syncode's use case.
85-
fn new(regex: String, dfa: Arc<DFA>) -> DFAState {
86+
fn new(regex: &[u8], dfa: Arc<DFA>) -> DFAState {
8687
// We always want the DFA to match starting from the beginning of the string.
8788
let config = start::Config::new().anchored(Anchored::Yes);
8889
let state_id = dfa.start_state(&config).unwrap();
@@ -94,36 +95,17 @@ impl DFAState {
9495
}
9596

9697
/// Convenience function to set the state how we want it.
97-
pub fn advance(&mut self, input: String) -> StateID {
98-
for c in input.chars() {
99-
self.consume_character(c);
98+
pub fn advance(&mut self, input: &[u8]) -> StateID {
99+
for b in input {
100+
self.consume(b);
100101
}
101102
self.state_id
102103
}
103104

104-
/// Consume a character, starting at the current state, setting and
105+
/// Consume a byte, starting at the current state, setting and
105106
/// returning the new state.
106-
///
107-
/// The logic here is non-trivial, because UTF-8 characters are a variable
108-
/// number of bytes long, and the underlying DFA has bytes as its input
109-
/// alphabet.
110-
pub fn consume_character(&mut self, c: char) -> StateID {
111-
let char_len = c.len_utf8();
112-
// Buffer to store character as bytes. UTF-8 characters are at most 4
113-
// bytes long, so allocate a buffer big enough to store the whole
114-
// character regardless of how long it turns out to be.
115-
let mut buf = [0; 4];
116-
c.encode_utf8(&mut buf);
117-
for (i, &b) in buf.iter().enumerate() {
118-
// The number of bytes per character is variable: we only need to
119-
// feed the number of bytes that the character actually is into the
120-
// DFA; any more would be incorrect. Break the loop once we've gone
121-
// past the end of the character.
122-
if i >= char_len {
123-
break;
124-
}
125-
self.state_id = self.dfa.next_state(self.state_id, b);
126-
}
107+
pub fn consume(&mut self, b: &u8) -> StateID {
108+
self.state_id = self.dfa.next_state(self.state_id, *b);
127109
self.state_id
128110
}
129111

@@ -179,14 +161,14 @@ impl Hash for DFAState {
179161
}
180162
}
181163
/// Compute the union of all states of a list of regexes.
182-
pub fn all_dfa_states(terminals: &Vec<String>) -> Vec<DFAState> {
164+
pub fn all_dfa_states(terminals: &Vec<&[u8]>) -> Vec<DFAState> {
183165
let mut res = Vec::new();
184166
let mut builder = DFABuilder::new();
185167
for terminal in terminals.iter() {
186-
let dfa = builder.build_dfa(terminal.clone());
168+
let dfa = builder.build_dfa(terminal);
187169
for state in dfa.states() {
188170
res.push(DFAState {
189-
regex: terminal.to_string().into(),
171+
regex: (*terminal).into(),
190172
dfa: dfa.dfa.clone(),
191173
state_id: state,
192174
});
@@ -201,40 +183,40 @@ mod tests {
201183

202184
#[test]
203185
fn test_consume_character_match() {
204-
let mut dfa_state = DFABuilder::new().build_dfa("a".to_string());
205-
let mut state = dfa_state.consume_character('a');
186+
let mut dfa_state = DFABuilder::new().build_dfa(b"a");
187+
let mut state = dfa_state.consume(&b"a"[0]);
206188
state = dfa_state.dfa.next_eoi_state(state);
207189
assert!(dfa_state.dfa.is_match_state(state));
208190
}
209191

210192
#[test]
211193
fn test_consume_character_fails_to_match() {
212-
let mut dfa_state = DFABuilder::new().build_dfa("a".to_string());
213-
let mut state = dfa_state.consume_character('b');
194+
let mut dfa_state = DFABuilder::new().build_dfa(b"a");
195+
let mut state = dfa_state.consume(&b"b"[0]);
214196
state = dfa_state.dfa.next_eoi_state(state);
215197
assert!(!dfa_state.dfa.is_match_state(state));
216198
}
217199

218200
#[test]
219201
fn test_advance_match() {
220-
let mut dfa_state = DFABuilder::new().build_dfa("[ab¥]*".to_string());
221-
let mut state = dfa_state.advance("aabb¥aab".to_string());
202+
let mut dfa_state = DFABuilder::new().build_dfa("[ab¥]*".as_bytes());
203+
let mut state = dfa_state.advance("aabb¥aab".as_bytes());
222204
state = dfa_state.dfa.next_eoi_state(state);
223205
assert!(dfa_state.dfa.is_match_state(state));
224206
}
225207

226208
#[test]
227209
fn test_advance_fails_to_match() {
228-
let mut dfa_state = DFABuilder::new().build_dfa("[ab]*".to_string());
229-
let mut state = dfa_state.advance("aabba¥ab".to_string());
210+
let mut dfa_state = DFABuilder::new().build_dfa("[ab]*".as_bytes());
211+
let mut state = dfa_state.advance("aabba¥ab".as_bytes());
230212
state = dfa_state.dfa.next_eoi_state(state);
231213
assert!(!dfa_state.dfa.is_match_state(state));
232214
}
233215

234216
#[test]
235217
fn test_advance() {
236-
let mut dfa_state = DFABuilder::new().build_dfa(r"[a-zA-Z_]*".to_string());
237-
let state = dfa_state.advance("indeed".to_string());
218+
let mut dfa_state = DFABuilder::new().build_dfa(r"[a-zA-Z_]*".as_bytes());
219+
let state = dfa_state.advance("indeed".as_bytes());
238220
assert!(dfa_state.dfa.is_match_state(state));
239221
}
240222
}

0 commit comments

Comments
 (0)