From 64106c4d3d4ddba8c7bc2af75376e6d3d3d75601 Mon Sep 17 00:00:00 2001 From: Date: Mon, 29 Jun 2015 20:16:15 +0000 Subject: Update documentation --- src/regex/program.rs.html | 1057 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1057 insertions(+) create mode 100644 src/regex/program.rs.html (limited to 'src/regex/program.rs.html') diff --git a/src/regex/program.rs.html b/src/regex/program.rs.html new file mode 100644 index 0000000..8f6ed02 --- /dev/null +++ b/src/regex/program.rs.html @@ -0,0 +1,1057 @@ + + + + + + + + + + program.rs.html -- source + + + + + + + + + + + + + + + +
  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+
+// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::cmp::{self, Ordering};
+
+use syntax;
+
+use Error;
+use backtrack::{Backtrack, BackMachine};
+use char::Char;
+use compile::Compiler;
+use nfa::{Nfa, NfaThreads};
+use pool::Pool;
+use prefix::Prefix;
+use re::CaptureIdxs;
+
+const NUM_PREFIX_LIMIT: usize = 30;
+const PREFIX_LENGTH_LIMIT: usize = 15;
+
+pub type InstIdx = usize;
+
+/// An instruction, the underlying unit of a compiled regular expression
+#[derive(Clone, Debug)]
+pub enum Inst {
+    /// A match has occurred.
+    /// This is always the last instruction and only occurs in a single spot.
+    /// We could special case this in the code, but it is much clearer to
+    /// handle it as a proper instruction.
+    Match,
+    /// Save the current location in the input into the given capture location.
+    Save(usize),
+    /// Jump to the instruction given.
+    Jump(InstIdx),
+    /// Match either instruction, preferring the first.
+    Split(InstIdx, InstIdx),
+    /// A zero-width instruction. When this instruction matches, the input
+    /// is not advanced.
+    EmptyLook(LookInst),
+    /// Match a single possibly case insensitive character.
+    Char(OneChar),
+    /// Match one or more possibly case insensitive character ranges.
+    Ranges(CharRanges),
+}
+
+/// A single character instruction.
+#[derive(Clone, Debug)]
+pub struct OneChar {
+    /// The character.
+    pub c: char,
+    /// True if the character should be matched case insensitively.
+    /// (i.e., The input character will need to be case folded.)
+    pub casei: bool,
+}
+
+/// A multi-range character class instruction.
+#[derive(Clone, Debug)]
+pub struct CharRanges {
+    /// Sorted sequence of non-overlapping ranges.
+    pub ranges: Vec<(char, char)>,
+    /// Whether to match case insensitively.
+    pub casei: bool,
+}
+
+/// The set of zero-width match instructions.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum LookInst {
+    /// Start of line or input.
+    StartLine,
+    /// End of line or input.
+    EndLine,
+    /// Start of input.
+    StartText,
+    /// End of input.
+    EndText,
+    /// Word character on one side and non-word character on other.
+    WordBoundary,
+    /// Word character on both sides or non-word character on both sides.
+    NotWordBoundary,
+}
+
+impl OneChar {
+    /// Tests whether the given input character matches this instruction.
+    #[inline(always)] // About ~5-15% more throughput then `#[inline]`
+    pub fn matches(&self, c: Char) -> bool {
+        self.c == c || (self.casei && self.c == c.case_fold())
+    }
+}
+
+impl CharRanges {
+    /// Emits a range specifically for the `.` expression.
+    pub fn any() -> CharRanges {
+        CharRanges {
+            ranges: vec![('\x00', '\u{10ffff}')],
+            casei: false,
+        }
+    }
+
+    /// Emits a range specifically for the `(?s).` expression.
+    pub fn any_nonl() -> CharRanges {
+        CharRanges {
+            ranges: vec![('\x00', '\x09'), ('\x0B', '\u{10ffff}')],
+            casei: false,
+        }
+    }
+
+    /// Emits a range from the AST character class.
+    pub fn from_class(cls: syntax::CharClass) -> CharRanges {
+        let casei = cls.is_case_insensitive();
+        CharRanges {
+            ranges: cls.into_iter().map(|r| (r.start, r.end)).collect(),
+            casei: casei,
+        }
+    }
+
+    /// Tests whether the given input character matches this instruction.
+    #[inline(always)] // About ~5-15% more throughput then `#[inline]`
+    pub fn matches(&self, mut c: Char) -> Option<usize> {
+        if self.casei {
+            c = c.case_fold();
+        }
+        // This speeds up the `match_class_unicode` benchmark by checking
+        // some common cases quickly without binary search. e.g., Matching
+        // a Unicode class on predominantly ASCII text.
+        for i in 0..cmp::min(self.ranges.len(), 4) {
+            let r = self.ranges[i];
+            if c < r.0 {
+                return None;
+            }
+            if c <= r.1 {
+                return Some(i);
+            }
+        }
+        self.ranges.binary_search_by(|r| {
+            if r.1 < c {
+                Ordering::Less
+            } else if r.0 > c {
+                Ordering::Greater
+            } else {
+                Ordering::Equal
+            }
+        }).ok()
+    }
+}
+
+impl LookInst {
+    /// Tests whether the pair of characters matches this zero-width
+    /// instruction.
+    pub fn matches(&self, c1: Char, c2: Char) -> bool {
+        use self::LookInst::*;
+        match *self {
+            StartLine => c1.is_none() || c1 == '\n',
+            EndLine => c2.is_none() || c2 == '\n',
+            StartText => c1.is_none(),
+            EndText => c2.is_none(),
+            ref wbty => {
+                let (w1, w2) = (c1.is_word_char(), c2.is_word_char());
+                (*wbty == WordBoundary && w1 ^ w2)
+                || (*wbty == NotWordBoundary && !(w1 ^ w2))
+            }
+        }
+    }
+}
+
+/// The matching engines offered by this regex implementation.
+///
+/// N.B. This is exported for use in testing.
+#[doc(hidden)]
+#[derive(Clone, Copy, Debug)]
+pub enum MatchEngine {
+    /// A bounded backtracking implementation. About twice as fast as the
+    /// NFA, but can only work on small regexes and small input.
+    Backtrack,
+    /// A full NFA simulation. Can always be employed but almost always the
+    /// slowest choice.
+    Nfa,
+    /// If the entire regex is a literal and no capture groups have been
+    /// requested, then we can degrade to a simple substring match.
+    Literals,
+}
+
+/// Program represents a compiled regular expression. Once an expression is
+/// compiled, its representation is immutable and will never change.
+/// (Well, almost. In fact, the matching engines cache state that can be
+/// reused on subsequent searches. But this is interior mutability that
+/// shouldn't be observable by the caller.)
+#[derive(Debug)]
+pub struct Program {
+    /// The original regular expression string.
+    pub original: String,
+    /// A sequence of instructions.
+    pub insts: Vec<Inst>,
+    /// The sequence of capture group names. There is an entry for each capture
+    /// group index and a name exists only if the capture group is named.
+    pub cap_names: Vec<Option<String>>,
+    /// If the regular expression requires a literal prefix in order to have a
+    /// match, that prefix is stored here as a DFA.
+    pub prefixes: Prefix,
+    /// True iff matching any literal prefix indicates a match.
+    pub prefixes_complete: bool,
+    /// True iff program is anchored at the beginning.
+    pub anchored_begin: bool,
+    /// True iff program is anchored at the end.
+    pub anchored_end: bool,
+    /// The type of matching engine to use.
+    /// When `None` (the default), pick an engine automatically.
+    pub engine: Option<MatchEngine>,
+    /// Cached NFA threads.
+    pub nfa_threads: Pool<NfaThreads>,
+    /// Cached backtracking memory.
+    pub backtrack: Pool<BackMachine>,
+}
+
+impl Program {
+    /// Compiles a Regex.
+    pub fn new(
+        engine: Option<MatchEngine>,
+        size_limit: usize,
+        re: &str,
+    ) -> Result<Program, Error> {
+        let expr = try!(syntax::Expr::parse(re));
+        let (insts, cap_names) = try!(Compiler::new(size_limit).compile(expr));
+        let (insts_len, ncaps) = (insts.len(), num_captures(&insts));
+        let create_threads = move || NfaThreads::new(insts_len, ncaps);
+        let create_backtrack = move || BackMachine::new();
+        let mut prog = Program {
+            original: re.into(),
+            insts: insts,
+            cap_names: cap_names,
+            prefixes: Prefix::Empty,
+            prefixes_complete: false,
+            anchored_begin: false,
+            anchored_end: false,
+            engine: engine,
+            nfa_threads: Pool::new(Box::new(create_threads)),
+            backtrack: Pool::new(Box::new(create_backtrack)),
+        };
+
+        prog.find_prefixes();
+        prog.anchored_begin = match prog.insts[1] {
+            Inst::EmptyLook(LookInst::StartText) => true,
+            _ => false,
+        };
+        prog.anchored_end = match prog.insts[prog.insts.len() - 3] {
+            Inst::EmptyLook(LookInst::EndText) => true,
+            _ => false,
+        };
+        Ok(prog)
+    }
+
+    /// Executes a compiled regex program.
+    pub fn exec(
+        &self,
+        caps: &mut CaptureIdxs,
+        text: &str,
+        start: usize,
+    ) -> bool {
+        match self.choose_engine(caps.len(), text) {
+            MatchEngine::Backtrack => Backtrack::exec(self, caps, text, start),
+            MatchEngine::Nfa => Nfa::exec(self, caps, text, start),
+            MatchEngine::Literals => {
+                match self.prefixes.find(&text[start..]) {
+                    None => false,
+                    Some((s, e)) => {
+                        if caps.len() == 2 {
+                            caps[0] = Some(start + s);
+                            caps[1] = Some(start + e);
+                        }
+                        true
+                    }
+                }
+            }
+        }
+    }
+
+    fn choose_engine(&self, cap_len: usize, text: &str) -> MatchEngine {
+        // If the engine is already chosen, then we use it.
+        // But that might not be a good idea. e.g., What if `Literals` is
+        // chosen and it can't work? I guess we should probably check whether
+        // the chosen engine is appropriate or not.
+        self.engine.unwrap_or_else(|| {
+            if cap_len <= 2
+               && self.prefixes.preserves_priority()
+               && self.prefixes_complete {
+                MatchEngine::Literals
+            } else if Backtrack::should_exec(self, text) {
+                // We're only here if the input and regex combined are small.
+                MatchEngine::Backtrack
+            } else {
+                MatchEngine::Nfa
+            }
+        })
+    }
+
+    /// Returns the total number of capture groups in the regular expression.
+    /// This includes the zeroth capture.
+    pub fn num_captures(&self) -> usize {
+        num_captures(&self.insts)
+    }
+
+    /// Allocate new capture groups.
+    pub fn alloc_captures(&self) -> Vec<Option<usize>> {
+        vec![None; 2 * self.num_captures()]
+    }
+
+    /// Find and store a prefix machine for the current program.
+    pub fn find_prefixes(&mut self) {
+        use self::Inst::*;
+
+        let (ps, complete) = self.prefixes_from_insts(1);
+        if ps.len() > 0 {
+            self.prefixes = Prefix::new(ps);
+            self.prefixes_complete = complete;
+            return;
+        }
+        let mut pc = 1;
+        let mut prefixes = vec![];
+        let mut pcomplete = true;
+        while let Split(x, y) = self.insts[pc] {
+            let (xps, xcomplete) = self.prefixes_from_insts(x);
+            let (yps, ycomplete) = self.prefixes_from_insts(y);
+            let mut done = false;
+            match (&self.insts[x], &self.insts[y]) {
+                // We should be able to support this. Add explicit stack. ---AG
+                (&Split(_, _), &Split(_, _)) => return,
+                (_, &Split(_, _)) if xps.len() == 0 => return,
+                (_, &Split(_, _)) => {
+                    pcomplete = pcomplete && xcomplete;
+                    prefixes.extend(xps);
+                    pc = y;
+                }
+                (&Split(_, _), _) if yps.len() == 0 => return,
+                (&Split(_, _), _) => {
+                    pcomplete = pcomplete && ycomplete;
+                    prefixes.extend(yps);
+                    pc = x;
+                }
+                _ if xps.len() == 0 || yps.len() == 0 => return,
+                // This is our base case. We've followed splits the whole
+                // way, which means both instructions lead to a match.
+                _ => {
+                    pcomplete = pcomplete && xcomplete && ycomplete;
+                    prefixes.extend(xps);
+                    prefixes.extend(yps);
+                    done = true;
+                }
+            }
+            // Arg. We've over-extended ourselves, quit with nothing to
+            // show for it.
+            if prefixes.len() > NUM_PREFIX_LIMIT {
+                return;
+            }
+            if done { break; }
+        }
+        self.prefixes = Prefix::new(prefixes);
+        self.prefixes_complete = pcomplete && self.prefixes.len() > 0;
+    }
+
+    /// Find a prefix starting at the given instruction.
+    ///
+    /// Returns `true` in the tuple if the end of the prefix leads trivially
+    /// to a match. (This may report false negatives, but being conservative
+    /// is OK.)
+    fn prefixes_from_insts(&self, mut pc: usize) -> (Vec<String>, bool) {
+        use self::Inst::*;
+
+        let mut complete = true;
+        let mut alts = vec![String::new()];
+        while pc < self.insts.len() {
+            let inst = &self.insts[pc];
+
+            // Each iteration adds one character to every alternate prefix *or*
+            // it stops. Thus, the prefix alternates grow in lock step, and it
+            // suffices to check one of them to see if the prefix limit has been
+            // exceeded.
+            if alts[0].len() > PREFIX_LENGTH_LIMIT {
+                complete = false;
+                break;
+            }
+            match *inst {
+                Save(_) => { pc += 1; continue } // completely ignore it
+                Char(OneChar { c, casei: false }) => {
+                    for alt in &mut alts {
+                        alt.push(c);
+                    }
+                    pc += 1;
+                }
+                Ranges(CharRanges { ref ranges, casei: false }) => {
+                    let nchars = num_chars_in_ranges(ranges);
+                    if alts.len() * nchars > NUM_PREFIX_LIMIT {
+                        complete = false;
+                        break;
+                    }
+
+                    let orig = alts;
+                    alts = Vec::with_capacity(orig.len());
+                    for &(s, e) in ranges {
+                        for c in (s as u32)..(e as u32 + 1){
+                            for alt in &orig {
+                                let mut alt = alt.clone();
+                                alt.push(::std::char::from_u32(c).unwrap());
+                                alts.push(alt);
+                            }
+                        }
+                    }
+                    pc += 1;
+                }
+                Jump(pc2) => pc = pc2,
+                _ => { complete = self.leads_to_match(pc); break }
+            }
+        }
+        if alts[0].len() == 0 {
+            (vec![], false)
+        } else {
+            (alts, complete)
+        }
+    }
+
+    fn leads_to_match(&self, mut pc: usize) -> bool {
+        // I'm pretty sure this is conservative, so it might have some
+        // false negatives.
+        loop {
+            match self.insts[pc] {
+                Inst::Match => return true,
+                Inst::Save(_) => pc += 1,
+                Inst::Jump(pc2) => pc = pc2,
+                _ => return false,
+            }
+        }
+    }
+}
+
+impl Clone for Program {
+    fn clone(&self) -> Program {
+        let (insts_len, ncaps) = (self.insts.len(), self.num_captures());
+        let create_threads = move || NfaThreads::new(insts_len, ncaps);
+        let create_backtrack = move || BackMachine::new();
+        Program {
+            original: self.original.clone(),
+            insts: self.insts.clone(),
+            cap_names: self.cap_names.clone(),
+            prefixes: self.prefixes.clone(),
+            prefixes_complete: self.prefixes_complete,
+            anchored_begin: self.anchored_begin,
+            anchored_end: self.anchored_end,
+            engine: self.engine,
+            nfa_threads: Pool::new(Box::new(create_threads)),
+            backtrack: Pool::new(Box::new(create_backtrack)),
+        }
+    }
+}
+
+/// Return the number of captures in the given sequence of instructions.
+fn num_captures(insts: &[Inst]) -> usize {
+    let mut n = 0;
+    for inst in insts {
+        match *inst {
+            Inst::Save(c) => n = cmp::max(n, c+1),
+            _ => {}
+        }
+    }
+    // There's exactly 2 Save slots for every capture.
+    n / 2
+}
+
+/// Count the number of characters in the given range.
+///
+/// This is useful for pre-emptively limiting the number of prefix literals
+/// we extract from a regex program.
+fn num_chars_in_ranges(ranges: &[(char, char)]) -> usize {
+    ranges.iter()
+          .map(|&(s, e)| (e as u32) - (s as u32))
+          .fold(0, |acc, len| acc + len) as usize
+}
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file -- cgit v1.2.3