// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::cmp::{self, Ordering};

use syntax;

use Error;
use backtrack::{Backtrack, BackMachine};
use char::Char;
use compile::Compiler;
use nfa::{Nfa, NfaThreads};
use pool::Pool;
use prefix::Prefix;
use re::CaptureIdxs;

const NUM_PREFIX_LIMIT: usize = 30;
const PREFIX_LENGTH_LIMIT: usize = 15;

pub type InstIdx = usize;

/// An instruction, the underlying unit of a compiled regular expression
#[derive(Clone, Debug)]
pub enum Inst {
    /// A match has occurred.
    /// This is always the last instruction and only occurs in a single spot.
    /// We could special case this in the code, but it is much clearer to
    /// handle it as a proper instruction.
    Match,
    /// Save the current location in the input into the given capture location.
    Save(usize),
    /// Jump to the instruction given.
    Jump(InstIdx),
    /// Match either instruction, preferring the first.
    Split(InstIdx, InstIdx),
    /// A zero-width instruction. When this instruction matches, the input
    /// is not advanced.
    EmptyLook(LookInst),
    /// Match a single possibly case insensitive character.
    Char(OneChar),
    /// Match one or more possibly case insensitive character ranges.
    Ranges(CharRanges),
}

/// A single character instruction.
#[derive(Clone, Debug)]
pub struct OneChar {
    /// The character.
    pub c: char,
    /// True if the character should be matched case insensitively.
    /// (i.e., The input character will need to be case folded.)
    pub casei: bool,
}

/// A multi-range character class instruction.
#[derive(Clone, Debug)]
pub struct CharRanges {
    /// Sorted sequence of non-overlapping ranges.
    pub ranges: Vec<(char, char)>,
    /// Whether to match case insensitively.
    pub casei: bool,
}

/// The set of zero-width match instructions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum LookInst {
    /// Start of line or input.
    StartLine,
    /// End of line or input.
    EndLine,
    /// Start of input.
    StartText,
    /// End of input.
    EndText,
    /// Word character on one side and non-word character on other.
    WordBoundary,
    /// Word character on both sides or non-word character on both sides.
    NotWordBoundary,
}

impl OneChar {
    /// Tests whether the given input character matches this instruction.
    #[inline(always)] // About ~5-15% more throughput then `#[inline]`
    pub fn matches(&self, c: Char) -> bool {
        self.c == c || (self.casei && self.c == c.case_fold())
    }
}

impl CharRanges {
    /// Emits a range specifically for the `.` expression.
    pub fn any() -> CharRanges {
        CharRanges {
            ranges: vec![('\x00', '\u{10ffff}')],
            casei: false,
        }
    }

    /// Emits a range specifically for the `(?s).` expression.
    pub fn any_nonl() -> CharRanges {
        CharRanges {
            ranges: vec![('\x00', '\x09'), ('\x0B', '\u{10ffff}')],
            casei: false,
        }
    }

    /// Emits a range from the AST character class.
    pub fn from_class(cls: syntax::CharClass) -> CharRanges {
        let casei = cls.is_case_insensitive();
        CharRanges {
            ranges: cls.into_iter().map(|r| (r.start, r.end)).collect(),
            casei: casei,
        }
    }

    /// Tests whether the given input character matches this instruction.
    #[inline(always)] // About ~5-15% more throughput then `#[inline]`
    pub fn matches(&self, mut c: Char) -> Option<usize> {
        if self.casei {
            c = c.case_fold();
        }
        // This speeds up the `match_class_unicode` benchmark by checking
        // some common cases quickly without binary search. e.g., Matching
        // a Unicode class on predominantly ASCII text.
        for i in 0..cmp::min(self.ranges.len(), 4) {
            let r = self.ranges[i];
            if c < r.0 {
                return None;
            }
            if c <= r.1 {
                return Some(i);
            }
        }
        self.ranges.binary_search_by(|r| {
            if r.1 < c {
                Ordering::Less
            } else if r.0 > c {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        }).ok()
    }
}

impl LookInst {
    /// Tests whether the pair of characters matches this zero-width
    /// instruction.
    pub fn matches(&self, c1: Char, c2: Char) -> bool {
        use self::LookInst::*;
        match *self {
            StartLine => c1.is_none() || c1 == '\n',
            EndLine => c2.is_none() || c2 == '\n',
            StartText => c1.is_none(),
            EndText => c2.is_none(),
            ref wbty => {
                let (w1, w2) = (c1.is_word_char(), c2.is_word_char());
                (*wbty == WordBoundary && w1 ^ w2)
                || (*wbty == NotWordBoundary && !(w1 ^ w2))
            }
        }
    }
}

/// The matching engines offered by this regex implementation.
///
/// N.B. This is exported for use in testing.
#[doc(hidden)]
#[derive(Clone, Copy, Debug)]
pub enum MatchEngine {
    /// A bounded backtracking implementation. About twice as fast as the
    /// NFA, but can only work on small regexes and small input.
    Backtrack,
    /// A full NFA simulation. Can always be employed but almost always the
    /// slowest choice.
    Nfa,
    /// If the entire regex is a literal and no capture groups have been
    /// requested, then we can degrade to a simple substring match.
    Literals,
}

/// Program represents a compiled regular expression. Once an expression is
/// compiled, its representation is immutable and will never change.
/// (Well, almost. In fact, the matching engines cache state that can be
/// reused on subsequent searches. But this is interior mutability that
/// shouldn't be observable by the caller.)
#[derive(Debug)]
pub struct Program {
    /// The original regular expression string.
    pub original: String,
    /// A sequence of instructions.
    pub insts: Vec<Inst>,
    /// The sequence of capture group names. There is an entry for each capture
    /// group index and a name exists only if the capture group is named.
    pub cap_names: Vec<Option<String>>,
    /// If the regular expression requires a literal prefix in order to have a
    /// match, that prefix is stored here as a DFA.
    pub prefixes: Prefix,
    /// True iff matching any literal prefix indicates a match.
    pub prefixes_complete: bool,
    /// True iff program is anchored at the beginning.
    pub anchored_begin: bool,
    /// True iff program is anchored at the end.
    pub anchored_end: bool,
    /// The type of matching engine to use.
    /// When `None` (the default), pick an engine automatically.
    pub engine: Option<MatchEngine>,
    /// Cached NFA threads.
    pub nfa_threads: Pool<NfaThreads>,
    /// Cached backtracking memory.
    pub backtrack: Pool<BackMachine>,
}

impl Program {
    /// Compiles a Regex.
    pub fn new(
        engine: Option<MatchEngine>,
        size_limit: usize,
        re: &str,
    ) -> Result<Program, Error> {
        let expr = try!(syntax::Expr::parse(re));
        let (insts, cap_names) = try!(Compiler::new(size_limit).compile(expr));
        let (insts_len, ncaps) = (insts.len(), num_captures(&insts));
        let create_threads = move || NfaThreads::new(insts_len, ncaps);
        let create_backtrack = move || BackMachine::new();
        let mut prog = Program {
            original: re.into(),
            insts: insts,
            cap_names: cap_names,
            prefixes: Prefix::Empty,
            prefixes_complete: false,
            anchored_begin: false,
            anchored_end: false,
            engine: engine,
            nfa_threads: Pool::new(Box::new(create_threads)),
            backtrack: Pool::new(Box::new(create_backtrack)),
        };

        prog.find_prefixes();
        prog.anchored_begin = match prog.insts[1] {
            Inst::EmptyLook(LookInst::StartText) => true,
            _ => false,
        };
        prog.anchored_end = match prog.insts[prog.insts.len() - 3] {
            Inst::EmptyLook(LookInst::EndText) => true,
            _ => false,
        };
        Ok(prog)
    }

    /// Executes a compiled regex program.
    pub fn exec(
        &self,
        caps: &mut CaptureIdxs,
        text: &str,
        start: usize,
    ) -> bool {
        match self.choose_engine(caps.len(), text) {
            MatchEngine::Backtrack => Backtrack::exec(self, caps, text, start),
            MatchEngine::Nfa => Nfa::exec(self, caps, text, start),
            MatchEngine::Literals => {
                match self.prefixes.find(&text[start..]) {
                    None => false,
                    Some((s, e)) => {
                        if caps.len() == 2 {
                            caps[0] = Some(start + s);
                            caps[1] = Some(start + e);
                        }
                        true
                    }
                }
            }
        }
    }

    fn choose_engine(&self, cap_len: usize, text: &str) -> MatchEngine {
        // If the engine is already chosen, then we use it.
        // But that might not be a good idea. e.g., What if `Literals` is
        // chosen and it can't work? I guess we should probably check whether
        // the chosen engine is appropriate or not.
        self.engine.unwrap_or_else(|| {
            if cap_len <= 2
               && self.prefixes.preserves_priority()
               && self.prefixes_complete {
                MatchEngine::Literals
            } else if Backtrack::should_exec(self, text) {
                // We're only here if the input and regex combined are small.
                MatchEngine::Backtrack
            } else {
                MatchEngine::Nfa
            }
        })
    }

    /// Returns the total number of capture groups in the regular expression.
    /// This includes the zeroth capture.
    pub fn num_captures(&self) -> usize {
        num_captures(&self.insts)
    }

    /// Allocate new capture groups.
    pub fn alloc_captures(&self) -> Vec<Option<usize>> {
        vec![None; 2 * self.num_captures()]
    }

    /// Find and store a prefix machine for the current program.
    pub fn find_prefixes(&mut self) {
        use self::Inst::*;

        let (ps, complete) = self.prefixes_from_insts(1);
        if ps.len() > 0 {
            self.prefixes = Prefix::new(ps);
            self.prefixes_complete = complete;
            return;
        }
        let mut pc = 1;
        let mut prefixes = vec![];
        let mut pcomplete = true;
        while let Split(x, y) = self.insts[pc] {
            let (xps, xcomplete) = self.prefixes_from_insts(x);
            let (yps, ycomplete) = self.prefixes_from_insts(y);
            let mut done = false;
            match (&self.insts[x], &self.insts[y]) {
                // We should be able to support this. Add explicit stack. ---AG
                (&Split(_, _), &Split(_, _)) => return,
                (_, &Split(_, _)) if xps.len() == 0 => return,
                (_, &Split(_, _)) => {
                    pcomplete = pcomplete && xcomplete;
                    prefixes.extend(xps);
                    pc = y;
                }
                (&Split(_, _), _) if yps.len() == 0 => return,
                (&Split(_, _), _) => {
                    pcomplete = pcomplete && ycomplete;
                    prefixes.extend(yps);
                    pc = x;
                }
                _ if xps.len() == 0 || yps.len() == 0 => return,
                // This is our base case. We've followed splits the whole
                // way, which means both instructions lead to a match.
                _ => {
                    pcomplete = pcomplete && xcomplete && ycomplete;
                    prefixes.extend(xps);
                    prefixes.extend(yps);
                    done = true;
                }
            }
            // Arg. We've over-extended ourselves, quit with nothing to
            // show for it.
            if prefixes.len() > NUM_PREFIX_LIMIT {
                return;
            }
            if done { break; }
        }
        self.prefixes = Prefix::new(prefixes);
        self.prefixes_complete = pcomplete && self.prefixes.len() > 0;
    }

    /// Find a prefix starting at the given instruction.
    ///
    /// Returns `true` in the tuple if the end of the prefix leads trivially
    /// to a match. (This may report false negatives, but being conservative
    /// is OK.)
    fn prefixes_from_insts(&self, mut pc: usize) -> (Vec<String>, bool) {
        use self::Inst::*;

        let mut complete = true;
        let mut alts = vec![String::new()];
        while pc < self.insts.len() {
            let inst = &self.insts[pc];

            // Each iteration adds one character to every alternate prefix *or*
            // it stops. Thus, the prefix alternates grow in lock step, and it
            // suffices to check one of them to see if the prefix limit has been
            // exceeded.
            if alts[0].len() > PREFIX_LENGTH_LIMIT {
                complete = false;
                break;
            }
            match *inst {
                Save(_) => { pc += 1; continue } // completely ignore it
                Char(OneChar { c, casei: false }) => {
                    for alt in &mut alts {
                        alt.push(c);
                    }
                    pc += 1;
                }
                Ranges(CharRanges { ref ranges, casei: false }) => {
                    let nchars = num_chars_in_ranges(ranges);
                    if alts.len() * nchars > NUM_PREFIX_LIMIT {
                        complete = false;
                        break;
                    }

                    let orig = alts;
                    alts = Vec::with_capacity(orig.len());
                    for &(s, e) in ranges {
                        for c in (s as u32)..(e as u32 + 1){
                            for alt in &orig {
                                let mut alt = alt.clone();
                                alt.push(::std::char::from_u32(c).unwrap());
                                alts.push(alt);
                            }
                        }
                    }
                    pc += 1;
                }
                Jump(pc2) => pc = pc2,
                _ => { complete = self.leads_to_match(pc); break }
            }
        }
        if alts[0].len() == 0 {
            (vec![], false)
        } else {
            (alts, complete)
        }
    }

    fn leads_to_match(&self, mut pc: usize) -> bool {
        // I'm pretty sure this is conservative, so it might have some
        // false negatives.
        loop {
            match self.insts[pc] {
                Inst::Match => return true,
                Inst::Save(_) => pc += 1,
                Inst::Jump(pc2) => pc = pc2,
                _ => return false,
            }
        }
    }
}

impl Clone for Program {
    fn clone(&self) -> Program {
        let (insts_len, ncaps) = (self.insts.len(), self.num_captures());
        let create_threads = move || NfaThreads::new(insts_len, ncaps);
        let create_backtrack = move || BackMachine::new();
        Program {
            original: self.original.clone(),
            insts: self.insts.clone(),
            cap_names: self.cap_names.clone(),
            prefixes: self.prefixes.clone(),
            prefixes_complete: self.prefixes_complete,
            anchored_begin: self.anchored_begin,
            anchored_end: self.anchored_end,
            engine: self.engine,
            nfa_threads: Pool::new(Box::new(create_threads)),
            backtrack: Pool::new(Box::new(create_backtrack)),
        }
    }
}

/// Return the number of captures in the given sequence of instructions.
fn num_captures(insts: &[Inst]) -> usize {
    let mut n = 0;
    for inst in insts {
        match *inst {
            Inst::Save(c) => n = cmp::max(n, c+1),
            _ => {}
        }
    }
    // There's exactly 2 Save slots for every capture.
    n / 2
}

/// Count the number of characters in the given range.
///
/// This is useful for pre-emptively limiting the number of prefix literals
/// we extract from a regex program.
fn num_chars_in_ranges(ranges: &[(char, char)]) -> usize {
    ranges.iter()
          .map(|&(s, e)| (e as u32) - (s as u32))
          .fold(0, |acc, len| acc + len) as usize
}
Keyboard shortcuts

Search tricks