From 64106c4d3d4ddba8c7bc2af75376e6d3d3d75601 Mon Sep 17 00:00:00 2001 From: Date: Mon, 29 Jun 2015 20:16:15 +0000 Subject: Update documentation --- src/aho_corasick/autiter.rs.html | 747 ++++++++++++++++ src/aho_corasick/full.rs.html | 329 +++++++ src/aho_corasick/lib.rs.html | 1823 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 2899 insertions(+) create mode 100644 src/aho_corasick/autiter.rs.html create mode 100644 src/aho_corasick/full.rs.html create mode 100644 src/aho_corasick/lib.rs.html (limited to 'src/aho_corasick') diff --git a/src/aho_corasick/autiter.rs.html b/src/aho_corasick/autiter.rs.html new file mode 100644 index 0000000..af7f372 --- /dev/null +++ b/src/aho_corasick/autiter.rs.html @@ -0,0 +1,747 @@ + + + + + + + + + + autiter.rs.html -- source + + + + + + + + + + + + + + + +
  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+
+use std::io::{self, BufRead, Read};
+
+use super::{ROOT_STATE, PatIdx, StateIdx};
+
+/// An abstraction over automatons and their corresponding iterators.
+pub trait Automaton: Sized {
+    /// Return the next state given the current state and next character.
+    fn next_state(&self, si: StateIdx, b: u8) -> StateIdx;
+
+    /// Return true if and only if the given state and current pattern index
+    /// indicate a match.
+    fn has_match(&self, si: StateIdx, outi: PatIdx) -> bool;
+
+    /// Build a match given the current state, pattern index and input index.
+    fn get_match(&self, si: StateIdx, outi: PatIdx, texti: usize) -> Match;
+
+    /// Attempt to skip through the input.
+    ///
+    /// This returns the index into `text` at which the next match attempt
+    /// should start. (If no skipping occurred, then the return value should
+    /// be equal to `at`.)
+    ///
+    /// Finally, if no match is possible, then return `text.len()`.
+    fn skip_to(&self, si: StateIdx, text: &[u8], at: usize) -> usize;
+
+    /// Returns true if and only if this automaton can skip through the input.
+    fn is_skippable(&self) -> bool;
+
+    /// Returns all of the patterns matched by this automaton.
+    ///
+    /// The order of the patterns is the order in which they were added.
+    fn patterns(&self) -> &[String];
+
+    /// Returns the pattern indexed at `i`.
+    ///
+    /// The index corresponds to the position at which the pattern was added
+    /// to the automaton, starting at `0`.
+    fn pattern(&self, i: usize) -> &str;
+
+    /// Return the number of patterns in the automaton.
+    #[inline]
+    fn len(&self) -> usize {
+        self.patterns().len()
+    }
+
+    /// Returns true if the automaton has no patterns.
+    #[inline]
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns an iterator of non-overlapping matches in `s`.
+    fn find<'a, 's>(
+        &'a self,
+        s: &'s str,
+    ) -> Matches<'a, 's, Self> {
+        Matches {
+            aut: self,
+            text: s.as_bytes(),
+            texti: 0,
+            si: ROOT_STATE,
+        }
+    }
+
+    /// Returns an iterator of overlapping matches in `s`.
+    fn find_overlapping<'a, 's>(
+        &'a self,
+        s: &'s str,
+    ) -> MatchesOverlapping<'a, 's, Self> {
+        MatchesOverlapping {
+            aut: self,
+            text: s.as_bytes(),
+            texti: 0,
+            si: ROOT_STATE,
+            outi: 0,
+        }
+    }
+
+    /// Returns an iterator of non-overlapping matches in the given reader.
+    fn stream_find<'a, R: io::Read>(
+        &'a self,
+        rdr: R,
+    ) -> StreamMatches<'a, R, Self> {
+        StreamMatches {
+            aut: self,
+            buf: io::BufReader::new(rdr),
+            texti: 0,
+            si: ROOT_STATE,
+        }
+    }
+
+    /// Returns an iterator of overlapping matches in the given reader.
+    fn stream_find_overlapping<'a, R: io::Read>(
+        &'a self,
+        rdr: R,
+    ) -> StreamMatchesOverlapping<'a, R, Self> {
+        StreamMatchesOverlapping {
+            aut: self,
+            buf: io::BufReader::new(rdr),
+            texti: 0,
+            si: ROOT_STATE,
+            outi: 0,
+        }
+    }
+}
+
+/// Records a match in the search text.
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
+pub struct Match {
+    /// The pattern index.
+    ///
+    /// This corresponds to the ordering in which the matched pattern was
+    /// added to the automaton, starting at `0`.
+    pub pati: usize,
+    /// The starting byte offset of the match in the search text.
+    pub start: usize,
+    /// The ending byte offset of the match in the search text.
+    ///
+    /// (This can be re-captiulated with `pati` and adding the pattern's
+    /// length to `start`, but it is convenient to have it here.)
+    pub end: usize,
+}
+
+/// An iterator of non-overlapping matches for in-memory text.
+///
+/// This iterator yields `Match` values.
+///
+/// `'a` is the lifetime of the automaton and `'s` is the lifetime of the
+/// search text.
+#[derive(Debug)]
+pub struct Matches<'a, 's, A: 'a + Automaton> {
+    aut: &'a A,
+    text: &'s [u8],
+    texti: usize,
+    si: StateIdx,
+}
+
+impl<'a, 's, A: Automaton> Iterator for Matches<'a, 's, A> {
+    type Item = Match;
+
+    fn next(&mut self) -> Option<Match> {
+        // When there's an initial lone start byte, it is usually worth it
+        // to use `memchr` to skip along the input. The problem is that
+        // the skipping function is called in the inner match loop, which
+        // can be quite costly if the skipping condition is never met.
+        // Therefore, we lift the case analysis outside of the main loop at
+        // the cost of repeating code.
+        if self.aut.is_skippable() {
+            self.texti = self.aut.skip_to(self.si, self.text, self.texti);
+            while self.texti < self.text.len() {
+                self.si = self.aut.next_state(self.si, self.text[self.texti]);
+                if self.aut.has_match(self.si, 0) {
+                    let m = self.aut.get_match(self.si, 0, self.texti);
+                    self.si = ROOT_STATE;
+                    self.texti += 1;
+                    return Some(m);
+                }
+                self.texti = self.aut.skip_to(
+                    self.si, self.text, self.texti + 1);
+            }
+        } else {
+            while self.texti < self.text.len() {
+                self.si = self.aut.next_state(self.si, self.text[self.texti]);
+                if self.aut.has_match(self.si, 0) {
+                    let m = self.aut.get_match(self.si, 0, self.texti);
+                    self.si = ROOT_STATE;
+                    self.texti += 1;
+                    return Some(m);
+                }
+                self.texti += 1;
+            }
+        }
+        None
+    }
+}
+
+/// An iterator of non-overlapping matches for streaming text.
+///
+/// This iterator yields `io::Result<Match>` values.
+///
+/// `'a` is the lifetime of the automaton and `R` is the type of the underlying
+/// `io::Read`er.
+#[derive(Debug)]
+pub struct StreamMatches<'a, R, A: 'a + Automaton> {
+    aut: &'a A,
+    buf: io::BufReader<R>,
+    texti: usize,
+    si: StateIdx,
+}
+
+impl<'a, R: io::Read, A: Automaton> Iterator for StreamMatches<'a, R, A> {
+    type Item = io::Result<Match>;
+
+    fn next(&mut self) -> Option<io::Result<Match>> {
+        let mut m = None;
+        let mut consumed = 0;
+'LOOP:  loop {
+            self.buf.consume(consumed);
+            let bs = match self.buf.fill_buf() {
+                Err(err) => return Some(Err(err)),
+                Ok(bs) if bs.len() == 0 => break,
+                Ok(bs) => bs,
+            };
+            consumed = bs.len(); // is shortened if we find a match
+            for (i, &b) in bs.iter().enumerate() {
+                self.si = self.aut.next_state(self.si, b);
+                if self.aut.has_match(self.si, 0) {
+                    m = Some(Ok(self.aut.get_match(self.si, 0, self.texti)));
+                    consumed = i + 1;
+                    self.texti += 1;
+                    self.si = ROOT_STATE;
+                    break 'LOOP;
+                }
+                self.texti += 1;
+            }
+        }
+        self.buf.consume(consumed);
+        m
+    }
+}
+
+/// An iterator of overlapping matches for in-memory text.
+///
+/// This iterator yields `Match` values.
+///
+/// `'a` is the lifetime of the automaton and `'s` is the lifetime of the
+/// search text.
+#[derive(Debug)]
+pub struct MatchesOverlapping<'a, 's, A: 'a + Automaton> {
+    aut: &'a A,
+    text: &'s [u8],
+    texti: usize,
+    si: StateIdx,
+    outi: usize,
+}
+
+impl<'a, 's, A: Automaton> Iterator for MatchesOverlapping<'a, 's, A> {
+    type Item = Match;
+
+    fn next(&mut self) -> Option<Match> {
+        if self.aut.has_match(self.si, self.outi) {
+            let m = self.aut.get_match(self.si, self.outi, self.texti);
+            self.outi += 1;
+            if !self.aut.has_match(self.si, self.outi) {
+                self.texti += 1;
+            }
+            return Some(m);
+        }
+
+        self.outi = 0;
+        self.texti = self.aut.skip_to(self.si, self.text, self.texti);
+        while self.texti < self.text.len() {
+            let b = self.text[self.texti];
+            self.si = self.aut.next_state(self.si, b);
+            if self.aut.has_match(self.si, self.outi) {
+                return self.next();
+            }
+            self.texti = self.aut.skip_to(self.si, self.text, self.texti + 1);
+        }
+        None
+    }
+}
+
+/// An iterator of overlapping matches for streaming text.
+///
+/// This iterator yields `io::Result<Match>` values.
+///
+/// `'a` is the lifetime of the automaton and `R` is the type of the underlying
+/// `io::Read`er.
+#[derive(Debug)]
+pub struct StreamMatchesOverlapping<'a, R, A: 'a + Automaton> {
+    aut: &'a A,
+    buf: io::BufReader<R>,
+    texti: usize,
+    si: StateIdx,
+    outi: usize,
+}
+
+impl<
+    'a,
+    R: io::Read,
+    A: Automaton,
+> Iterator for StreamMatchesOverlapping<'a, R, A> {
+    type Item = io::Result<Match>;
+
+    fn next(&mut self) -> Option<io::Result<Match>> {
+        if self.aut.has_match(self.si, self.outi) {
+            let m = self.aut.get_match(self.si, self.outi, self.texti);
+            self.outi += 1;
+            if !self.aut.has_match(self.si, self.outi) {
+                self.texti += 1;
+            }
+            return Some(Ok(m));
+        }
+        let mut m = None;
+        let mut consumed = 0;
+        self.outi = 0;
+'LOOP:  loop {
+            self.buf.consume(consumed);
+            let bs = match self.buf.fill_buf() {
+                Err(err) => return Some(Err(err)),
+                Ok(bs) if bs.len() == 0 => break,
+                Ok(bs) => bs,
+            };
+            consumed = bs.len(); // is shortened if we find a match
+            for (i, &b) in bs.iter().enumerate() {
+                self.si = self.aut.next_state(self.si, b);
+                if self.aut.has_match(self.si, self.outi) {
+                    m = Some(Ok(self.aut.get_match(self.si, self.outi,
+                                                   self.texti)));
+                    consumed = i + 1;
+                    self.outi += 1;
+                    if !self.aut.has_match(self.si, self.outi) {
+                        self.texti += 1;
+                    }
+                    break 'LOOP;
+                }
+                self.texti += 1;
+            }
+        }
+        self.buf.consume(consumed);
+        m
+    }
+}
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/aho_corasick/full.rs.html b/src/aho_corasick/full.rs.html new file mode 100644 index 0000000..f9f4f65 --- /dev/null +++ b/src/aho_corasick/full.rs.html @@ -0,0 +1,329 @@ + + + + + + + + + + full.rs.html -- source + + + + + + + + + + + + + + + +
  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+
+use memchr::memchr;
+
+use super::{
+    FAIL_STATE, ROOT_STATE,
+    StateIdx, PatIdx,
+    AcAutomaton, Transitions, Match,
+};
+use super::autiter::Automaton;
+
+/// A complete Aho-Corasick automaton.
+///
+/// This uses a single transition matrix that permits each input character
+/// to move to the next state with a single lookup in the matrix.
+///
+/// This is as fast as it gets, but it is guaranteed to use a lot of memory.
+/// Namely, it will use at least `4 * 256 * #states`, where the number of
+/// states is capped at length of all patterns concatenated.
+#[derive(Clone, Debug)]
+pub struct FullAcAutomaton {
+    pats: Vec<String>,
+    // i * #states + si
+    trans: Vec<StateIdx>,  // row-major, where states are rows
+    out: Vec<Vec<PatIdx>>, // indexed by StateIdx
+    start_bytes: Vec<u8>,
+}
+
+impl FullAcAutomaton {
+    /// Build a new expanded Aho-Corasick automaton from an existing
+    /// Aho-Corasick automaton.
+    pub fn new<T: Transitions>(ac: AcAutomaton<T>) -> FullAcAutomaton {
+        let mut fac = FullAcAutomaton {
+            pats: vec![],
+            trans: vec![FAIL_STATE; 256 * ac.states.len()],
+            out: vec![vec![]; ac.states.len()],
+            start_bytes: vec![],
+        };
+        fac.build_matrix(&ac);
+        fac.pats = ac.pats;
+        fac.start_bytes = ac.start_bytes;
+        fac
+    }
+
+    fn set(&mut self, si: StateIdx, i: u8, goto: StateIdx) {
+        let ns = self.num_states();
+        self.trans[i as usize * ns + si as usize] = goto;
+    }
+
+    #[inline]
+    fn num_states(&self) -> usize {
+        self.out.len()
+    }
+}
+
+impl Automaton for FullAcAutomaton {
+    #[inline]
+    fn next_state(&self, si: StateIdx, i: u8) -> StateIdx {
+        self.trans[i as usize * self.num_states() + si as usize]
+    }
+
+    #[inline]
+    fn get_match(&self, si: StateIdx, outi: usize, texti: usize) -> Match {
+        let pati = self.out[si as usize][outi];
+        let patlen = self.pats[pati].len();
+        let start = texti + 1 - patlen;
+        Match {
+            pati: pati,
+            start: start,
+            end: start + patlen,
+        }
+    }
+
+    #[inline]
+    fn has_match(&self, si: StateIdx, outi: usize) -> bool {
+        outi < self.out[si as usize].len()
+    }
+
+    #[inline]
+    fn skip_to(&self, si: StateIdx, text: &[u8], at: usize) -> usize {
+        if si != ROOT_STATE || !self.is_skippable() {
+            return at;
+        }
+        let b = self.start_bytes[0];
+        match memchr(b, &text[at..]) {
+            None => text.len(),
+            Some(i) => at + i,
+        }
+    }
+
+    #[inline]
+    fn is_skippable(&self) -> bool {
+        self.start_bytes.len() == 1
+    }
+
+    #[inline]
+    fn patterns(&self) -> &[String] {
+        &self.pats
+    }
+
+    #[inline]
+    fn pattern(&self, i: usize) -> &str {
+        &self.pats[i]
+    }
+}
+
+impl FullAcAutomaton {
+    fn build_matrix<T: Transitions>(&mut self, ac: &AcAutomaton<T>) {
+        for (si, s) in ac.states.iter().enumerate().skip(1) {
+            for b in (0..256).map(|b| b as u8) {
+                self.set(si as StateIdx, b, ac.next_state(si as StateIdx, b));
+            }
+            for &pati in &s.out {
+                self.out[si].push(pati);
+            }
+        }
+    }
+}
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/aho_corasick/lib.rs.html b/src/aho_corasick/lib.rs.html new file mode 100644 index 0000000..e75bc4d --- /dev/null +++ b/src/aho_corasick/lib.rs.html @@ -0,0 +1,1823 @@ + + + + + + + + + + lib.rs.html -- source + + + + + + + + + + + + + + + +
  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
+711
+712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
+724
+725
+726
+727
+728
+729
+730
+731
+732
+733
+734
+735
+736
+737
+738
+739
+740
+741
+742
+743
+744
+745
+746
+747
+748
+749
+750
+751
+752
+753
+754
+755
+756
+757
+758
+759
+760
+761
+762
+763
+764
+765
+766
+767
+768
+769
+770
+771
+772
+773
+774
+775
+776
+777
+778
+779
+780
+781
+782
+783
+784
+785
+786
+787
+788
+789
+790
+791
+792
+793
+794
+795
+796
+797
+798
+799
+800
+801
+802
+803
+804
+805
+806
+807
+808
+809
+810
+811
+812
+813
+814
+815
+816
+817
+818
+819
+820
+821
+822
+823
+824
+825
+826
+827
+828
+829
+830
+831
+832
+833
+834
+835
+836
+837
+838
+839
+840
+841
+842
+843
+844
+845
+846
+847
+848
+849
+850
+851
+852
+853
+854
+855
+856
+857
+858
+859
+860
+861
+862
+863
+
+/*!
+An implementation of the
+[Aho-Corasick string search algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
+
+The Aho-Corasick algorithm is principally useful when you need to search many
+large texts for a fixed (possibly large) set of keywords. In particular, the
+Aho-Corasick algorithm preprocesses the set of keywords by constructing a
+finite state machine. The search phase is then a quick linear scan through the
+text. Each character in the search text causes a state transition in the
+automaton. Matches are reported when the automaton enters a match state.
+
+# Examples
+
+The main type exposed by this crate is `AcAutomaton`, which can be constructed
+from an iterator of pattern strings:
+
+```rust
+use aho_corasick::{Automaton, AcAutomaton};
+
+let aut = AcAutomaton::new(vec!["apple", "maple"]);
+
+// AcAutomaton also implements `FromIterator`:
+let aut: AcAutomaton = ["apple", "maple"].iter().cloned().collect();
+```
+
+Finding matches can be done with `find`:
+
+```rust
+use aho_corasick::{Automaton, AcAutomaton, Match};
+
+let aut = AcAutomaton::new(vec!["apple", "maple"]);
+let mut it = aut.find("I like maple apples.");
+assert_eq!(it.next(), Some(Match {
+    pati: 1,
+    start: 7,
+    end: 12,
+}));
+assert_eq!(it.next(), Some(Match {
+    pati: 0,
+    start: 13,
+    end: 18,
+}));
+assert_eq!(it.next(), None);
+```
+
+Use `find_overlapping` if you want to report all matches, even if they
+overlap with each other.
+
+```rust
+use aho_corasick::{Automaton, AcAutomaton, Match};
+
+let aut = AcAutomaton::new(vec!["abc", "a"]);
+let matches: Vec<_> = aut.find_overlapping("abc").collect();
+assert_eq!(matches, vec![
+    Match { pati: 1, start: 0, end: 1}, Match { pati: 0, start: 0, end: 3 },
+]);
+
+// Regular `find` will report only one match:
+let matches: Vec<_> = aut.find("abc").collect();
+assert_eq!(matches, vec![Match { pati: 1, start: 0, end: 1}]);
+```
+
+Finally, there are also methods for finding matches on *streams*. Namely, the
+search text does not have to live in memory. It's useful to run this on files
+that can't fit into memory:
+
+```no_run
+use std::fs::File;
+
+use aho_corasick::{Automaton, AcAutomaton};
+
+let aut = AcAutomaton::new(vec!["foo", "bar", "baz"]);
+let rdr = File::open("search.txt").unwrap();
+for m in aut.stream_find(rdr) {
+    let m = m.unwrap(); // could be an IO error
+    println!("Pattern '{}' matched at: ({}, {})",
+             aut.pattern(m.pati), m.start, m.end);
+}
+```
+
+There is also `stream_find_overlapping`, which is just like `find_overlapping`,
+but it operates on streams.
+
+Please see `dict-search.rs` in this crate's `examples` directory for a more
+complete example. It creates a large automaton from a dictionary and can do a
+streaming match over arbitrarily large data.
+
+# Memory usage
+
+A key aspect of an Aho-Corasick implementation is how the state transitions
+are represented. The easiest way to make the automaton fast is to store a
+sparse 256-slot map in each state. It maps an input byte to a state index.
+This makes the matching loop extremely fast, since it translates to a simple
+pointer read.
+
+The problem is that as the automaton accumulates more states, you end up paying
+a `256 * 4` (`4` is for the `u32` state index) byte penalty for every state
+regardless of how many transitions it has.
+
+To solve this, only states near the root of the automaton have this sparse
+map representation. States near the leaves of the automaton use a dense mapping
+that requires a linear scan.
+
+(The specific limit currently set is `3`, so that states with a depth less than
+or equal to `3` are less memory efficient. The result is that the memory usage
+of the automaton stops growing rapidly past ~60MB, even for automatons with
+thousands of patterns.)
+
+If you'd like to opt for the less-memory-efficient-but-faster version, then
+you can construct an `AcAutomaton` with a `Sparse` transition strategy:
+
+```rust
+use aho_corasick::{Automaton, AcAutomaton, Match, Sparse};
+
+let aut = AcAutomaton::<Sparse>::with_transitions(vec!["abc", "a"]);
+let matches: Vec<_> = aut.find("abc").collect();
+assert_eq!(matches, vec![Match { pati: 1, start: 0, end: 1}]);
+```
+*/
+
+#![deny(missing_docs)]
+
+extern crate memchr;
+#[cfg(test)] extern crate quickcheck;
+#[cfg(test)] extern crate rand;
+
+use std::collections::VecDeque;
+use std::fmt;
+use std::iter::FromIterator;
+
+use memchr::memchr;
+
+pub use self::autiter::{
+    Automaton, Match,
+    Matches, MatchesOverlapping, StreamMatches, StreamMatchesOverlapping,
+};
+pub use self::full::FullAcAutomaton;
+
+// We're specifying paths explicitly so that we can use
+// these modules simultaneously from `main.rs`.
+// Should probably make just make `main.rs` a separate crate.
+#[path = "autiter.rs"]
+mod autiter;
+#[path = "full.rs"]
+mod full;
+
+/// The integer type used for the state index.
+///
+/// Limiting this to 32 bit integers can have a big impact on memory usage
+/// when using the `Sparse` transition representation.
+pub type StateIdx = u32;
+
+type PatIdx = usize;
+
+// Constants for special state indexes.
+const FAIL_STATE: u32 = 0;
+const ROOT_STATE: u32 = 1;
+
+// Limit the depth at which we use a dense alphabet map. Once the limit is
+// reached, a sparse set is used (and lookup becomes O(n)).
+//
+// This does have a performance hit, but the (straight forward) alternative
+// is to have a `256 * 4` byte overhead for every state.
+// Given that Aho-Corasick is typically used for dictionary searching, this
+// can lead to dramatic memory bloat.
+//
+// This limit should only be increased at your peril. Namely, in the worst
+// case, `256^DENSE_DEPTH_THRESHOLD * 4` corresponds to the memory usage in
+// bytes. A value of `3` gives us a solid cap at around 64MB, which works
+// well in practice even for dictionary sized automatons.
+//
+// Why not make this user configurable? Well, it doesn't make much sense
+// because we pay for it with case analysis in the matching loop. Increasing it
+// doesn't have much impact on performance (outside of pathological cases?).
+//
+// There is probably room for adding a new automaton type that doesn't have
+// this restriction.
+//
+// N.B. Someone else seems to have discovered an alternative, but I haven't
+// grokked it yet: https://github.com/mischasan/aho-corasick
+const DENSE_DEPTH_THRESHOLD: u32 = 3;
+
+/// An Aho-Corasick finite automaton.
+#[derive(Clone)]
+pub struct AcAutomaton<T=Dense> {
+    pats: Vec<String>,
+    states: Vec<State<T>>,
+    start_bytes: Vec<u8>,
+}
+
+#[derive(Clone)]
+struct State<T> {
+    out: Vec<PatIdx>,
+    fail: StateIdx,
+    goto: T,
+    depth: u32,
+}
+
+impl AcAutomaton {
+    /// Create a new automaton from an iterator of patterns.
+    ///
+    /// The patterns must be convertible to Unicode `String` values via the
+    /// `Into` trait.
+    pub fn new<S, I>(pats: I) -> AcAutomaton<Dense>
+            where S: Into<String>, I: IntoIterator<Item=S> {
+        AcAutomaton::with_transitions(pats)
+    }
+}
+
+impl<T: Transitions> AcAutomaton<T> {
+    /// Create a new automaton from an iterator of patterns.
+    ///
+    /// This constructor allows one to choose the transition representation.
+    ///
+    /// The patterns must be convertible to Unicode `String` values via the
+    /// `Into` trait.
+    pub fn with_transitions<S, I>(pats: I) -> AcAutomaton<T>
+            where S: Into<String>, I: IntoIterator<Item=S> {
+        AcAutomaton {
+            pats: vec![], // filled in later, avoid wrath of borrow checker
+            states: vec![State::new(0), State::new(0)], // empty and root
+            start_bytes: vec![], // also filled in later
+        }.build(pats.into_iter().map(Into::into).collect())
+    }
+
+    /// Build out the entire automaton into a single matrix.
+    ///
+    /// This will make searching as fast as possible at the expense of using
+    /// at least `4 * 256 * #states` bytes of memory.
+    pub fn into_full(self) -> FullAcAutomaton {
+        FullAcAutomaton::new(self)
+    }
+}
+
+impl<T: Transitions> Automaton for AcAutomaton<T> {
+    #[inline]
+    fn next_state(&self, mut si: StateIdx, b: u8) -> StateIdx {
+        loop {
+            let maybe_si = self.states[si as usize].goto(b);
+            if maybe_si != FAIL_STATE {
+                si = maybe_si;
+                break;
+            } else {
+                si = self.states[si as usize].fail;
+            }
+        }
+        si
+    }
+
+    #[inline]
+    fn get_match(&self, si: StateIdx, outi: usize, texti: usize) -> Match {
+        let pati = self.states[si as usize].out[outi];
+        let patlen = self.pats[pati].len();
+        let start = texti + 1 - patlen;
+        Match {
+            pati: pati,
+            start: start,
+            end: start + patlen,
+        }
+    }
+
+    #[inline]
+    fn has_match(&self, si: StateIdx, outi: usize) -> bool {
+        outi < self.states[si as usize].out.len()
+    }
+
+    #[inline]
+    fn skip_to(&self, si: StateIdx, text: &[u8], at: usize) -> usize {
+        if si != ROOT_STATE || !self.is_skippable() {
+            return at;
+        }
+        let b = self.start_bytes[0];
+        match memchr(b, &text[at..]) {
+            None => text.len(),
+            Some(i) => at + i,
+        }
+    }
+
+    #[inline]
+    fn is_skippable(&self) -> bool {
+        self.start_bytes.len() == 1
+    }
+
+    #[inline]
+    fn patterns(&self) -> &[String] {
+        &self.pats
+    }
+
+    #[inline]
+    fn pattern(&self, i: usize) -> &str {
+        &self.pats[i]
+    }
+}
+
+// Below contains code for *building* the automaton. It's a reasonably faithful
+// translation of the description/psuedo-code from:
+// http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
+
+impl<T: Transitions> AcAutomaton<T> {
+    // This is the first phase and builds the initial keyword tree.
+    fn build(mut self, pats: Vec<String>) -> AcAutomaton<T> {
+        for (pati, pat) in pats.iter().enumerate() {
+            if pat.is_empty() {
+                continue;
+            }
+            let mut previ = ROOT_STATE;
+            for &b in pat.as_bytes() {
+                if self.states[previ as usize].goto(b) != FAIL_STATE {
+                    previ = self.states[previ as usize].goto(b);
+                } else {
+                    let depth = self.states[previ as usize].depth + 1;
+                    let nexti = self.add_state(State::new(depth));
+                    self.states[previ as usize].set_goto(b, nexti);
+                    previ = nexti;
+                }
+            }
+            self.states[previ as usize].out.push(pati);
+        }
+        for c in (0..256).into_iter().map(|c| c as u8) {
+            if self.states[ROOT_STATE as usize].goto(c) == FAIL_STATE {
+                self.states[ROOT_STATE as usize].set_goto(c, ROOT_STATE);
+            } else {
+                self.start_bytes.push(c);
+            }
+        }
+        self.pats = pats;
+        self.fill()
+    }
+
+    // The second phase that fills in the back links.
+    fn fill(mut self) -> AcAutomaton<T> {
+        // Fill up the queue with all non-root transitions out of the root
+        // node. Then proceed by breadth first traversal.
+        let mut q = VecDeque::new();
+        for c in (0..256).into_iter().map(|c| c as u8) {
+            let si = self.states[ROOT_STATE as usize].goto(c);
+            if si != ROOT_STATE {
+                q.push_front(si);
+            }
+        }
+        while let Some(si) = q.pop_back() {
+            for c in (0..256).into_iter().map(|c| c as u8) {
+                let u = self.states[si as usize].goto(c);
+                if u != FAIL_STATE {
+                    q.push_front(u);
+                    let mut v = self.states[si as usize].fail;
+                    while self.states[v as usize].goto(c) == FAIL_STATE {
+                        v = self.states[v as usize].fail;
+                    }
+                    let ufail = self.states[v as usize].goto(c);
+                    self.states[u as usize].fail = ufail;
+                    let ufail_out = self.states[ufail as usize].out.clone();
+                    self.states[u as usize].out.extend(ufail_out);
+                }
+            }
+        }
+        self
+    }
+
+    fn add_state(&mut self, state: State<T>) -> StateIdx {
+        let i = self.states.len();
+        self.states.push(state);
+        i as StateIdx
+    }
+}
+
+impl<T: Transitions> State<T> {
+    fn new(depth: u32) -> State<T> {
+        State {
+            out: vec![],
+            fail: 1,
+            goto: Transitions::new(depth),
+            depth: depth,
+        }
+    }
+
+    fn goto(&self, b: u8) -> StateIdx { self.goto.goto(b) }
+    fn set_goto(&mut self, b: u8, si: StateIdx) { self.goto.set_goto(b, si); }
+}
+
+/// An abstraction over state transition strategies.
+///
+/// This is an attempt to let the caller choose the space/time trade offs
+/// used for state transitions.
+///
+/// (It's possible that this interface is merely good enough for just the two
+/// implementations in this crate.)
+pub trait Transitions {
+    /// Return a new state at the given depth.
+    fn new(depth: u32) -> Self;
+    /// Return the next state index given the next character.
+    fn goto(&self, alpha: u8) -> StateIdx;
+    /// Set the next state index for the character given.
+    fn set_goto(&mut self, alpha: u8, si: StateIdx);
+}
+
+/// State transitions that can be stored either sparsely or densely.
+///
+/// This uses less space but at the expense of slower matching.
+#[derive(Clone, Debug)]
+pub struct Dense(DenseChoice);
+
+#[derive(Clone, Debug)]
+enum DenseChoice {
+    Sparse(Vec<StateIdx>), // indexed by alphabet
+    Dense(Vec<(u8, StateIdx)>),
+}
+
+impl Transitions for Dense {
+    fn new(depth: u32) -> Dense {
+        if depth <= DENSE_DEPTH_THRESHOLD {
+            Dense(DenseChoice::Sparse(vec![0; 256]))
+        } else {
+            Dense(DenseChoice::Dense(vec![]))
+        }
+    }
+
+    fn goto(&self, b1: u8) -> StateIdx {
+        match self.0 {
+            DenseChoice::Sparse(ref m) => m[b1 as usize],
+            DenseChoice::Dense(ref m) => {
+                for &(b2, si) in m {
+                    if b1 == b2 {
+                        return si;
+                    }
+                }
+                FAIL_STATE
+            }
+        }
+    }
+
+    fn set_goto(&mut self, b: u8, si: StateIdx) {
+        match self.0 {
+            DenseChoice::Sparse(ref mut m) => m[b as usize] = si,
+            DenseChoice::Dense(ref mut m) => m.push((b, si)),
+        }
+    }
+}
+
+/// State transitions that are always sparse.
+///
+/// This can use enormous amounts of memory when there are many patterns,
+/// but matching is very fast.
+#[derive(Clone, Debug)]
+pub struct Sparse(Vec<StateIdx>);
+
+impl Transitions for Sparse {
+    fn new(_: u32) -> Sparse {
+        Sparse(vec![0; 256])
+    }
+
+    #[inline]
+    fn goto(&self, b: u8) -> StateIdx {
+        self.0[b as usize]
+    }
+
+    fn set_goto(&mut self, b: u8, si: StateIdx) {
+        self.0[b as usize] = si;
+    }
+}
+
+impl<S: Into<String>> FromIterator<S> for AcAutomaton {
+    /// Create an automaton from an iterator of strings.
+    fn from_iter<T>(it: T) -> AcAutomaton where T: IntoIterator<Item=S> {
+        AcAutomaton::new(it)
+    }
+}
+
+// Provide some question debug impls for viewing automatons.
+// The custom impls mostly exist for special showing of sparse maps.
+
+impl<T: Transitions> fmt::Debug for AcAutomaton<T> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use std::iter::repeat;
+
+        try!(writeln!(f, "{}", repeat('-').take(79).collect::<String>()));
+        try!(writeln!(f, "Patterns: {:?}", self.pats));
+        for (i, state) in self.states.iter().enumerate().skip(1) {
+            try!(writeln!(f, "{:3}: {}", i, state.debug(i == 1)));
+        }
+        write!(f, "{}", repeat('-').take(79).collect::<String>())
+    }
+}
+
+impl<T: Transitions> State<T> {
+    fn debug(&self, root: bool) -> String {
+        format!("State {{ depth: {:?}, out: {:?}, fail: {:?}, goto: {{{}}} }}",
+                self.depth, self.out, self.fail, self.goto_string(root))
+    }
+
+    fn goto_string(&self, root: bool) -> String {
+        use std::char::from_u32;
+
+        let mut goto = vec![];
+        for b in (0..256).map(|b| b as u8) {
+            let si = self.goto(b);
+            if (!root && si == FAIL_STATE) || (root && si == ROOT_STATE) {
+                continue;
+            }
+            goto.push(format!("{} => {}", from_u32(b as u32).unwrap(), si));
+        }
+        goto.connect(", ")
+    }
+}
+
+impl<T: Transitions> fmt::Debug for State<T> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.debug(false))
+    }
+}
+
+impl<T: Transitions> AcAutomaton<T> {
+    #[doc(hidden)]
+    pub fn dot(&self) -> String {
+        use std::fmt::Write;
+        let mut out = String::new();
+        macro_rules! w {
+            ($w:expr, $($tt:tt)*) => { {write!($w, $($tt)*)}.unwrap() }
+        }
+
+        w!(out, r#"
+digraph automaton {{
+    label=<<FONT POINT-SIZE="20">{}</FONT>>;
+    labelloc="l";
+    labeljust="l";
+    rankdir="LR";
+"#, self.pats.connect(", "));
+        for (i, s) in self.states.iter().enumerate().skip(1) {
+            let i = i as u32;
+            if s.out.len() == 0 {
+                w!(out, "    {};\n", i);
+            } else {
+                w!(out, "    {} [peripheries=2];\n", i);
+            }
+            w!(out, "    {} -> {} [style=dashed];\n", i, s.fail);
+            for b in (0..256).map(|b| b as u8) {
+                let si = s.goto(b);
+                if si == FAIL_STATE || (i == ROOT_STATE && si == ROOT_STATE) {
+                    continue;
+                }
+                w!(out, "    {} -> {} [label={}];\n", i, si, b as char);
+            }
+        }
+        w!(out, "}}");
+        out
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashSet;
+    use std::io;
+
+    use quickcheck::{Arbitrary, Gen, quickcheck};
+    use rand::Rng;
+
+    use super::{Automaton, AcAutomaton, Match};
+
+    fn aut_find<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        AcAutomaton::new(xs.to_vec()).find(&haystack).collect()
+    }
+
+    fn aut_finds<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        let cur = io::Cursor::new(haystack.as_bytes());
+        AcAutomaton::new(xs.to_vec())
+            .stream_find(cur).map(|r| r.unwrap()).collect()
+    }
+
+    fn aut_findf<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        AcAutomaton::new(xs.to_vec()).into_full().find(haystack).collect()
+    }
+
+    fn aut_findfs<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        let cur = io::Cursor::new(haystack.as_bytes());
+        AcAutomaton::new(xs.to_vec())
+            .into_full()
+            .stream_find(cur).map(|r| r.unwrap()).collect()
+    }
+
+    fn aut_findo<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        AcAutomaton::new(xs.to_vec()).find_overlapping(haystack).collect()
+    }
+
+    fn aut_findos<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        let cur = io::Cursor::new(haystack.as_bytes());
+        AcAutomaton::new(xs.to_vec())
+            .stream_find_overlapping(cur).map(|r| r.unwrap()).collect()
+    }
+
+    fn aut_findfo<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        AcAutomaton::new(xs.to_vec())
+            .into_full().find_overlapping(haystack).collect()
+    }
+
+    fn aut_findfos<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        let cur = io::Cursor::new(haystack.as_bytes());
+        AcAutomaton::new(xs.to_vec())
+            .into_full()
+            .stream_find_overlapping(cur).map(|r| r.unwrap()).collect()
+    }
+
+    #[test]
+    fn one_pattern_one_match() {
+        let ns = vec!["a"];
+        let hay = "za";
+        let matches = vec![
+            Match { pati: 0, start: 1, end: 2 },
+        ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn one_pattern_many_match() {
+        let ns = vec!["a"];
+        let hay = "zazazzzza";
+        let matches = vec![
+            Match { pati: 0, start: 1, end: 2 },
+            Match { pati: 0, start: 3, end: 4 },
+            Match { pati: 0, start: 8, end: 9 },
+        ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn one_longer_pattern_one_match() {
+        let ns = vec!["abc"];
+        let hay = "zazabcz";
+        let matches = vec![ Match { pati: 0, start: 3, end: 6 } ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn one_longer_pattern_many_match() {
+        let ns = vec!["abc"];
+        let hay = "zazabczzzzazzzabc";
+        let matches = vec![
+            Match { pati: 0, start: 3, end: 6 },
+            Match { pati: 0, start: 14, end: 17 },
+        ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_pattern_one_match() {
+        let ns = vec!["a", "b"];
+        let hay = "zb";
+        let matches = vec![ Match { pati: 1, start: 1, end: 2 } ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_pattern_many_match() {
+        let ns = vec!["a", "b"];
+        let hay = "zbzazzzzb";
+        let matches = vec![
+            Match { pati: 1, start: 1, end: 2 },
+            Match { pati: 0, start: 3, end: 4 },
+            Match { pati: 1, start: 8, end: 9 },
+        ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_longer_pattern_one_match() {
+        let ns = vec!["abc", "xyz"];
+        let hay = "zazxyzz";
+        let matches = vec![ Match { pati: 1, start: 3, end: 6 } ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_longer_pattern_many_match() {
+        let ns = vec!["abc", "xyz"];
+        let hay = "zazxyzzzzzazzzabcxyz";
+        let matches = vec![
+            Match { pati: 1, start: 3, end: 6 },
+            Match { pati: 0, start: 14, end: 17 },
+            Match { pati: 1, start: 17, end: 20 },
+        ];
+        assert_eq!(&aut_find(&ns, hay), &matches);
+        assert_eq!(&aut_finds(&ns, hay), &matches);
+        assert_eq!(&aut_findf(&ns, hay), &matches);
+        assert_eq!(&aut_findfs(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_longer_pattern_overlap_one_match() {
+        let ns = vec!["abc", "bc"];
+        let hay = "zazabcz";
+        let matches = vec![
+            Match { pati: 0, start: 3, end: 6 },
+            Match { pati: 1, start: 4, end: 6 },
+        ];
+        assert_eq!(&aut_findo(&ns, hay), &matches);
+        assert_eq!(&aut_findos(&ns, hay), &matches);
+        assert_eq!(&aut_findfo(&ns, hay), &matches);
+        assert_eq!(&aut_findfos(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_longer_pattern_overlap_one_match_reverse() {
+        let ns = vec!["abc", "bc"];
+        let hay = "xbc";
+        let matches = vec![ Match { pati: 1, start: 1, end: 3 } ];
+        assert_eq!(&aut_findo(&ns, hay), &matches);
+        assert_eq!(&aut_findos(&ns, hay), &matches);
+        assert_eq!(&aut_findfo(&ns, hay), &matches);
+        assert_eq!(&aut_findfos(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_longer_pattern_overlap_many_match() {
+        let ns = vec!["abc", "bc", "c"];
+        let hay = "zzzabczzzbczzzc";
+        let matches = vec![
+            Match { pati: 0, start: 3, end: 6 },
+            Match { pati: 1, start: 4, end: 6 },
+            Match { pati: 2, start: 5, end: 6 },
+            Match { pati: 1, start: 9, end: 11 },
+            Match { pati: 2, start: 10, end: 11 },
+            Match { pati: 2, start: 14, end: 15 },
+        ];
+        assert_eq!(&aut_findo(&ns, hay), &matches);
+        assert_eq!(&aut_findos(&ns, hay), &matches);
+        assert_eq!(&aut_findfo(&ns, hay), &matches);
+        assert_eq!(&aut_findfos(&ns, hay), &matches);
+    }
+
+    #[test]
+    fn many_longer_pattern_overlap_many_match_reverse() {
+        let ns = vec!["abc", "bc", "c"];
+        let hay = "zzzczzzbczzzabc";
+        let matches = vec![
+            Match { pati: 2, start: 3, end: 4 },
+            Match { pati: 1, start: 7, end: 9 },
+            Match { pati: 2, start: 8, end: 9 },
+            Match { pati: 0, start: 12, end: 15 },
+            Match { pati: 1, start: 13, end: 15 },
+            Match { pati: 2, start: 14, end: 15 },
+        ];
+        assert_eq!(&aut_findo(&ns, hay), &matches);
+        assert_eq!(&aut_findos(&ns, hay), &matches);
+        assert_eq!(&aut_findfo(&ns, hay), &matches);
+        assert_eq!(&aut_findfos(&ns, hay), &matches);
+    }
+
+    // Quickcheck time.
+
+    // This generates very small ascii strings, which makes them more likely
+    // to interact in interesting ways with larger haystack strings.
+    #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+    pub struct SmallAscii(String);
+
+    impl Arbitrary for SmallAscii {
+        fn arbitrary<G: Gen>(g: &mut G) -> SmallAscii {
+            use std::char::from_u32;
+            SmallAscii((0..2)
+                       .map(|_| from_u32(g.gen_range(97, 123)).unwrap())
+                       .collect())
+        }
+
+        fn shrink(&self) -> Box<Iterator<Item=SmallAscii>> {
+            Box::new(self.0.shrink().map(SmallAscii))
+        }
+    }
+
+    impl From<SmallAscii> for String {
+        fn from(s: SmallAscii) -> String { s.0 }
+    }
+
+    // This is the same arbitrary impl as `String`, except it has a bias toward
+    // ASCII characters.
+    #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+    pub struct BiasAscii(String);
+
+    impl Arbitrary for BiasAscii {
+        fn arbitrary<G: Gen>(g: &mut G) -> BiasAscii {
+            use std::char::from_u32;
+            let size = { let s = g.size(); g.gen_range(0, s) };
+            let mut s = String::with_capacity(size);
+            for _ in 0..size {
+                if g.gen_weighted_bool(3) {
+                    s.push(char::arbitrary(g));
+                } else {
+                    for _ in 0..5 {
+                        s.push(from_u32(g.gen_range(97, 123)).unwrap());
+                    }
+                }
+            }
+            BiasAscii(s)
+        }
+
+        fn shrink(&self) -> Box<Iterator<Item=BiasAscii>> {
+            Box::new(self.0.shrink().map(BiasAscii))
+        }
+    }
+
+    fn naive_find<S>(xs: &[S], haystack: &str) -> Vec<Match>
+            where S: Clone + Into<String> {
+        let needles: Vec<String> =
+            xs.to_vec().into_iter().map(Into::into).collect();
+        let mut matches = vec![];
+        for hi in 0..haystack.len() {
+            for (pati, needle) in needles.iter().enumerate() {
+                let needle = needle.as_bytes();
+                if needle.len() == 0 || needle.len() > haystack.len() - hi {
+                    continue;
+                }
+                if needle == &haystack.as_bytes()[hi..hi+needle.len()] {
+                    matches.push(Match {
+                        pati: pati,
+                        start: hi,
+                        end: hi + needle.len(),
+                    });
+                }
+            }
+        }
+        matches
+    }
+
+    #[test]
+    fn qc_ac_equals_naive() {
+        fn prop(needles: Vec<SmallAscii>, haystack: BiasAscii) -> bool {
+            let aut_matches = aut_findo(&needles, &haystack.0);
+            let naive_matches = naive_find(&needles, &haystack.0);
+            // Ordering isn't always the same. I don't think we care, so do
+            // an unordered comparison.
+            let aset: HashSet<Match> = aut_matches.iter().cloned().collect();
+            let nset: HashSet<Match> = naive_matches.iter().cloned().collect();
+            aset == nset
+        }
+        quickcheck(prop as fn(Vec<SmallAscii>, BiasAscii) -> bool);
+    }
+}
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file -- cgit v1.2.3