// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

/*!
This crate provides a regular expression parser and an abstract syntax for
regular expressions. The abstract syntax is defined by the `Expr` type. The
concrete syntax is enumerated in the
[`regex`](../regex/index.html#syntax)
crate documentation.

Note that since this crate is first and foremost an implementation detail for
the `regex` crate, it may experience more frequent breaking changes. It is
exposed as a separate crate so that others may use it to do analysis on regular
expressions or even build their own matching engine.

# Example: parsing an expression

Parsing a regular expression can be done with the `Expr::parse` function.

```rust
use regex_syntax::Expr;

assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![
    Expr::Literal { chars: vec!['a', 'b'], casei: false },
    Expr::Literal { chars: vec!['y', 'z'], casei: false },
]));
```

# Example: inspecting an error

The parser in this crate provides very detailed error values. For example,
if an invalid character class range is given:

```rust
use regex_syntax::{Expr, ErrorKind};

let err = Expr::parse(r"[z-a]").unwrap_err();
assert_eq!(err.position(), 4);
assert_eq!(err.kind(), &ErrorKind::InvalidClassRange {
    start: 'z',
    end: 'a',
});
```

Or unbalanced parentheses:

```rust
use regex_syntax::{Expr, ErrorKind};

let err = Expr::parse(r"ab(cd").unwrap_err();
assert_eq!(err.position(), 2);
assert_eq!(err.kind(), &ErrorKind::UnclosedParen);
```
*/

#![deny(missing_docs)]

#[cfg(test)] extern crate quickcheck;
#[cfg(test)] extern crate rand;

mod parser;
mod unicode;

use std::char;
use std::cmp::{Ordering, max, min};
use std::fmt;
use std::iter::IntoIterator;
use std::ops::Deref;
use std::slice;
use std::vec;

use unicode::case_folding;

use self::Expr::*;
use self::Repeater::*;

pub use parser::is_punct;

/// A regular expression abstract syntax tree.
///
/// An `Expr` represents the abstract syntax of a regular expression.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Expr {
    /// An empty regex (which never matches any text).
    Empty,
    /// A sequence of one or more literal characters to be matched.
    Literal {
        /// The characters.
        chars: Vec<char>,
        /// Whether to match case insensitively.
        casei: bool,
    },
    /// Match any character, excluding new line.
    AnyChar,
    /// Match any character.
    AnyCharNoNL,
    /// A character class.
    Class(CharClass),
    /// Match the start of a line or beginning of input.
    StartLine,
    /// Match the end of a line or end of input.
    EndLine,
    /// Match the beginning of input.
    StartText,
    /// Match the end of input.
    EndText,
    /// Match a word boundary (word character on one side and a non-word
    /// character on the other).
    WordBoundary,
    /// Match a position that is not a word boundary (word or non-word
    /// characters on both sides).
    NotWordBoundary,
    /// A group, possibly non-capturing.
    Group {
        /// The expression inside the group.
        e: Box<Expr>,
        /// The capture index (starting at `1`) only for capturing groups.
        i: Option<usize>,
        /// The capture name, only for capturing named groups.
        name: Option<String>,
    },
    /// A repeat operator (`?`, `*`, `+` or `{m,n}`).
    Repeat {
        /// The expression to be repeated. Limited to literals, `.`, classes
        /// or grouped expressions.
        e: Box<Expr>,
        /// The type of repeat operator used.
        r: Repeater,
        /// Whether the repeat is greedy (match the most) or not (match the
        /// least).
        greedy: bool,
    },
    /// A concatenation of expressions. Must be matched one after the other.
    ///
    /// N.B. A concat expression can only appear at the top-level or
    /// immediately inside a group expression.
    Concat(Vec<Expr>),
    /// An alternation of expressions. Only one must match.
    ///
    /// N.B. An alternate expression can only appear at the top-level or
    /// immediately inside a group expression.
    Alternate(Vec<Expr>),
}

type CaptureIndex = Option<usize>;

type CaptureName = Option<String>;

/// The type of a repeat operator expression.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Repeater {
    /// Match zero or one (`?`).
    ZeroOrOne,
    /// Match zero or more (`*`).
    ZeroOrMore,
    /// Match one or more (`+`).
    OneOrMore,
    /// Match for at least `min` and at most `max` (`{m,n}`).
    ///
    /// When `max` is `None`, there is no upper bound on the number of matches.
    Range {
        /// Lower bound on the number of matches.
        min: u32,
        /// Optional upper bound on the number of matches.
        max: Option<u32>,
    },
}

/// A character class.
///
/// A character class has a canonical format that the parser guarantees. Its
/// canonical format is defined by the following invariants:
///
/// 1. Given any Unicode scalar value, it is matched by *at most* one character
///    range in a canonical character class.
/// 2. Every adjacent character range is separated by at least one Unicode
///    scalar value.
/// 3. Given any pair of character ranges `r1` and `r2`, if
///    `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
///    character class.
///
/// In sum, any `CharClass` produced by this crate's parser is a sorted
/// sequence of non-overlapping ranges. This makes it possible to test whether
/// a character is matched by a class with a binary search.
///
/// Additionally, a character class may be marked *case insensitive*. If it's
/// case insensitive, then:
///
/// 1. Simple case folding has been applied to all ranges.
/// 2. Simple case folding must be applied to a character before testing
///    whether it matches the character class.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CharClass {
    ranges: Vec<ClassRange>,
    casei: bool,
}

/// A single inclusive range in a character class.
///
/// Since range boundaries are defined by Unicode scalar values, the boundaries
/// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may
/// *cover* codepoints that are not scalar values.
///
/// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
/// for testing whether a character is contained inside a given range.
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
pub struct ClassRange {
    /// The start character of the range.
    ///
    /// This must be less than or equal to `end`.
    pub start: char,

    /// The end character of the range.
    ///
    /// This must be greater than or equal to `end`.
    pub end: char,
}

impl Expr {
    /// Parses a string in a regular expression syntax tree.
    pub fn parse(s: &str) -> Result<Expr> {
        parser::Parser::parse(s).map(|e| e.simplify())
    }

    /// Returns true iff the expression can be repeated by a quantifier.
    fn can_repeat(&self) -> bool {
        match *self {
            Literal{..}
            | AnyChar
            | AnyCharNoNL
            | Class(_)
            | StartLine | EndLine | StartText | EndText
            | WordBoundary | NotWordBoundary
            | Group{..}
            => true,
            _ => false,
        }
    }

    fn simplify(self) -> Expr {
        fn combine_literals(es: &mut Vec<Expr>, e: Expr) {
            match (es.pop(), e) {
                (None, e) => es.push(e),
                (Some(Literal { chars: mut chars1, casei: casei1 }),
                 Literal { chars: chars2, casei: casei2 }) => {
                    if casei1 == casei2 {
                        chars1.extend(chars2);
                        es.push(Literal { chars: chars1, casei: casei1 });
                    } else {
                        es.push(Literal { chars: chars1, casei: casei1 });
                        es.push(Literal { chars: chars2, casei: casei2 });
                    }
                }
                (Some(e1), e2) => {
                    es.push(e1);
                    es.push(e2);
                }
            }
        }
        match self {
            Repeat { e, r, greedy } => Repeat {
                e: Box::new(e.simplify()),
                r: r,
                greedy: greedy,
            },
            Group { e, i, name } => {
                let e = e.simplify();
                if i.is_none() && name.is_none() && e.can_repeat() {
                    e
                } else {
                    Group { e: Box::new(e), i: i, name: name }
                }
            }
            Concat(es) => {
                let mut new_es = Vec::with_capacity(es.len());
                for e in es {
                    combine_literals(&mut new_es, e.simplify());
                }
                if new_es.len() == 1 {
                    new_es.pop().unwrap()
                } else {
                    Concat(new_es)
                }
            }
            Alternate(es) => Alternate(es.into_iter()
                                         .map(|e| e.simplify())
                                         .collect()),
            e => e,
        }
    }
}

impl Deref for CharClass {
    type Target = Vec<ClassRange>;
    fn deref(&self) -> &Vec<ClassRange> { &self.ranges }
}

impl IntoIterator for CharClass {
    type Item = ClassRange;
    type IntoIter = vec::IntoIter<ClassRange>;
    fn into_iter(self) -> vec::IntoIter<ClassRange> { self.ranges.into_iter() }
}

impl<'a> IntoIterator for &'a CharClass {
    type Item = &'a ClassRange;
    type IntoIter = slice::Iter<'a, ClassRange>;
    fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() }
}

impl CharClass {
    /// Create a new class from an existing set of ranges.
    fn new(ranges: Vec<ClassRange>) -> CharClass {
        CharClass { ranges: ranges, casei: false }
    }

    /// Create an empty class.
    fn empty() -> CharClass {
        CharClass::new(Vec::new())
    }

    /// Returns true if `c` is matched by this character class.
    ///
    /// If this character class is case insensitive, then simple case folding
    /// is applied to `c` before checking for a match.
    pub fn matches(&self, mut c: char) -> bool {
        if self.is_case_insensitive() {
            c = simple_case_fold(c)
        }
        self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok()
    }

    /// Returns true if this character class should be matched case
    /// insensitively.
    ///
    /// When `true`, simple case folding has already been applied to the
    /// class.
    pub fn is_case_insensitive(&self) -> bool {
        self.casei
    }

    /// Create a new empty class from this one.
    ///
    /// Namely, its capacity and case insensitive setting will be the same.
    fn to_empty(&self) -> CharClass {
        CharClass { ranges: Vec::with_capacity(self.len()), casei: self.casei }
    }

    /// Merge two classes and canonicalize them.
    #[cfg(test)]
    fn merge(mut self, other: CharClass) -> CharClass {
        self.ranges.extend(other);
        self.canonicalize()
    }

    /// Canonicalze any sequence of ranges.
    ///
    /// This is responsible for enforcing the canonical format invariants
    /// as described on the docs for the `CharClass` type.
    fn canonicalize(mut self) -> CharClass {
        // TODO: Save some cycles here by checking if already canonicalized.
        self.ranges.sort();
        let mut ordered = self.to_empty(); // TODO: Do this in place?
        for candidate in self {
            // If the candidate overlaps with an existing range, then it must
            // be the most recent range added because we process the candidates
            // in order.
            if let Some(or) = ordered.ranges.last_mut() {
                if or.overlapping(candidate) {
                    *or = or.merge(candidate);
                    continue;
                }
            }
            ordered.ranges.push(candidate);
        }
        ordered
    }

    /// Negates the character class.
    ///
    /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
    /// if and only if `c` does not match `self.negate()`.
    ///
    /// Note that this cannot be called on a character class that has had
    /// case folding applied to it. (Because case folding turns on a flag
    /// and doesn't store every possible matching character. Therefore,
    /// its negation is tricky to get right. Turns out, we don't need it
    /// anyway!)
    fn negate(mut self) -> CharClass {
        fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) }

        // Never allow negating of a class that has been case folded!
        assert!(!self.casei);

        if self.is_empty() { return self; }
        self = self.canonicalize();
        let mut inv = self.to_empty();
        if self[0].start > '\x00' {
            inv.ranges.push(range('\x00', dec_char(self[0].start)));
        }
        for win in self.windows(2) {
            inv.ranges.push(range(inc_char(win[0].end),
                                  dec_char(win[1].start)));
        }
        if self[self.len() - 1].end < char::MAX {
            inv.ranges.push(range(inc_char(self[self.len() - 1].end),
                                  char::MAX));
        }
        inv
    }

    /// Apply case folding to this character class.
    ///
    /// One a class had been case folded, it cannot be negated.
    fn case_fold(self) -> CharClass {
        let mut folded = self.to_empty();
        folded.casei = true;
        for r in self {
            // Applying case folding to a range is expensive because *every*
            // character needed to be examined. Thus, we avoid that drudgery
            // if no character in the current range is in our case folding
            // table.
            if r.needs_case_folding() {
                folded.ranges.extend(r.case_fold());
            } else {
                folded.ranges.push(r);
            }
        }
        folded.canonicalize()
    }
}

impl ClassRange {
    /// Create a new class range.
    ///
    /// If `end < start`, then the two values are swapped so that
    /// the invariant `start <= end` is preserved.
    fn new(start: char, end: char) -> ClassRange {
        if start <= end {
            ClassRange { start: start, end: end }
        } else {
            ClassRange { start: end, end: start }
        }
    }

    /// Create a range of one character.
    fn one(c: char) -> ClassRange {
        ClassRange { start: c, end: c }
    }

    /// Returns true if and only if the two ranges are overlapping. Note that
    /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
    fn overlapping(self, other: ClassRange) -> bool {
        max(self.start, other.start) <= inc_char(min(self.end, other.end))
    }

    /// Creates a new range representing the union of `self` and `other.
    fn merge(self, other: ClassRange) -> ClassRange {
        ClassRange {
            start: min(self.start, other.start),
            end: max(self.end, other.end),
        }
    }

    /// Returns true if and only if this range contains a character that is
    /// in the case folding table.
    fn needs_case_folding(self) -> bool {
        case_folding::C_plus_S_table
        .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok()
    }

    /// Apply case folding to this range.
    ///
    /// Since case folding might add characters such that the range is no
    /// longer contiguous, this returns multiple class ranges. They are in
    /// canonical order.
    fn case_fold(self) -> Vec<ClassRange> {
        let (s, e) = (self.start as u32, self.end as u32 + 1);
        let mut start = simple_case_fold(self.start);
        let mut end = start;
        let mut next_case_fold = self.start;
        let mut ranges = Vec::with_capacity(100);
        for mut c in (s+1..e).filter_map(char::from_u32) {
            if c >= next_case_fold {
                c = match simple_case_fold_result(c) {
                    Ok(i) => case_folding::C_plus_S_table[i].1,
                    Err(i) => {
                        if i < case_folding::C_plus_S_table.len() {
                            next_case_fold = case_folding::C_plus_S_table[i].0;
                        } else {
                            next_case_fold = '\u{10FFFF}'
                        }
                        c
                    }
                };
            }
            if c != inc_char(end) {
                ranges.push(ClassRange::new(start, end));
                start = c;
            }
            end = c;
        }
        ranges.push(ClassRange::new(start, end));
        ranges
    }
}

impl PartialEq<char> for ClassRange {
    #[inline]
    fn eq(&self, other: &char) -> bool {
        self.start <= *other && *other <= self.end
    }
}

impl PartialEq<ClassRange> for char {
    #[inline]
    fn eq(&self, other: &ClassRange) -> bool {
        other.eq(self)
    }
}

impl PartialOrd<char> for ClassRange {
    #[inline]
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        Some(if self == other {
            Ordering::Equal
        } else if *other > self.end {
            Ordering::Greater
        } else {
            Ordering::Less
        })
    }
}

impl PartialOrd<ClassRange> for char {
    #[inline]
    fn partial_cmp(&self, other: &ClassRange) -> Option<Ordering> {
        other.partial_cmp(self).map(|o| o.reverse())
    }
}

/// This implementation of `Display` will write a regular expression from the
/// syntax tree. It does not write the original string parsed.
impl fmt::Display for Expr {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Empty => write!(f, ""),
            Literal { ref chars, casei } => {
                if casei { try!(write!(f, "(?i:")); }
                for &c in chars {
                    try!(write!(f, "{}", quote_char(c)));
                }
                if casei { try!(write!(f, ")")); }
                Ok(())
            }
            AnyChar => write!(f, "(?s:.)"),
            AnyCharNoNL => write!(f, "."),
            Class(ref cls) => write!(f, "{}", cls),
            StartLine => write!(f, "(?m:^)"),
            EndLine => write!(f, "(?m:$)"),
            StartText => write!(f, r"^"),
            EndText => write!(f, r"$"),
            WordBoundary => write!(f, r"\b"),
            NotWordBoundary => write!(f, r"\B"),
            Group { ref e, i: None, name: None } => write!(f, "(?:{})", e),
            Group { ref e, name: None, .. } => write!(f, "({})", e),
            Group { ref e, name: Some(ref n), .. } => {
                write!(f, "(?P<{}>{})", n, e)
            }
            Repeat { ref e, r, greedy } => {
                match &**e {
                    &Literal { ref chars, .. } if chars.len() > 1 => {
                        try!(write!(f, "(?:{}){}", e, r))
                    }
                    _ => try!(write!(f, "{}{}", e, r)),
                }
                if !greedy { try!(write!(f, "?")); }
                Ok(())
            }
            Concat(ref es) => {
                for e in es {
                    try!(write!(f, "{}", e));
                }
                Ok(())
            }
            Alternate(ref es) => {
                for (i, e) in es.iter().enumerate() {
                    if i > 0 { try!(write!(f, "|")); }
                    try!(write!(f, "{}", e));
                }
                Ok(())
            }
        }
    }
}

impl fmt::Display for Repeater {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            ZeroOrOne => write!(f, "?"),
            ZeroOrMore => write!(f, "*"),
            OneOrMore => write!(f, "+"),
            Range { min: s, max: None } => write!(f, "{{{},}}", s),
            Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s),
            Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e),
        }
    }
}

impl fmt::Display for CharClass {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.casei {
            try!(write!(f, "(?i:"));
        }
        try!(write!(f, "["));
        for range in self.iter() {
            try!(write!(f, "{}", range));
        }
        try!(write!(f, "]"));
        if self.casei {
            try!(write!(f, ")"));
        }
        Ok(())
    }
}

impl fmt::Display for ClassRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}-{}", quote_char(self.start), quote_char(self.end))
    }
}

/// An alias for computations that can return a `Error`.
pub type Result<T> = ::std::result::Result<T, Error>;

/// A parse error.
///
/// This includes details about the specific type of error and a rough
/// approximation of where it occurred.
#[derive(Clone, Debug, PartialEq)]
pub struct Error {
    pos: usize,
    surround: String,
    kind: ErrorKind,
}

/// The specific type of parse error that can occur.
#[derive(Clone, Debug, PartialEq)]
pub enum ErrorKind {
    /// A negation symbol is used twice in flag settings.
    /// e.g., `(?-i-s)`.
    DoubleFlagNegation,
    /// The same capture name was used more than once.
    /// e.g., `(?P<a>.)(?P<a>.)`.
    DuplicateCaptureName(String),
    /// An alternate is empty. e.g., `(|a)`.
    EmptyAlternate,
    /// A capture group name is empty. e.g., `(?P<>a)`.
    EmptyCaptureName,
    /// A negation symbol was not proceded by any flags. e.g., `(?i-)`.
    EmptyFlagNegation,
    /// A group is empty. e.g., `()`.
    EmptyGroup,
    /// An invalid number was used in a counted repetition. e.g., `a{b}`.
    InvalidBase10(String),
    /// An invalid hexadecimal number was used in an escape sequence.
    /// e.g., `\xAG`.
    InvalidBase16(String),
    /// An invalid capture name was used. e.g., `(?P<0a>b)`.
    InvalidCaptureName(String),
    /// An invalid class range was givien. Specifically, when the start of the
    /// range is greater than the end. e.g., `[z-a]`.
    InvalidClassRange {
        /// The first character specified in the range.
        start: char,
        /// The second character specified in the range.
        end: char,
    },
    /// An escape sequence was used in a character class where it is not
    /// allowed. e.g., `[a-\pN]` or `[\A]`.
    InvalidClassEscape(Expr),
    /// An invalid counted repetition min/max was given. e.g., `a{2,1}`.
    InvalidRepeatRange {
        /// The first number specified in the repetition.
        min: u32,
        /// The second number specified in the repetition.
        max: u32,
    },
    /// An invalid Unicode scalar value was used in a long hexadecimal
    /// sequence. e.g., `\x{D800}`.
    InvalidScalarValue(u32),
    /// An empty counted repetition operator. e.g., `a{}`.
    MissingBase10,
    /// A repetition operator was not applied to an expression. e.g., `*`.
    RepeaterExpectsExpr,
    /// A repetition operator was applied to an expression that cannot be
    /// repeated. e.g., `a+*` or `a|*`.
    RepeaterUnexpectedExpr(Expr),
    /// A capture group name that is never closed. e.g., `(?P<a`.
    UnclosedCaptureName(String),
    /// An unclosed hexadecimal literal. e.g., `\x{a`.
    UnclosedHex,
    /// An unclosed parenthesis. e.g., `(a`.
    UnclosedParen,
    /// An unclosed counted repetition operator. e.g., `a{2`.
    UnclosedRepeat,
    /// An unclosed named Unicode class. e.g., `\p{Yi`.
    UnclosedUnicodeName,
    /// Saw end of regex before class was closed. e.g., `[a`.
    UnexpectedClassEof,
    /// Saw end of regex before escape sequence was closed. e.g., `\`.
    UnexpectedEscapeEof,
    /// Saw end of regex before flags were closed. e.g., `(?i`.
    UnexpectedFlagEof,
    /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`.
    UnexpectedTwoDigitHexEof,
    /// Unopened parenthesis. e.g., `)`.
    UnopenedParen,
    /// Unrecognized escape sequence. e.g., `\q`.
    UnrecognizedEscape(char),
    /// Unrecognized flag. e.g., `(?a)`.
    UnrecognizedFlag(char),
    /// Unrecognized named Unicode class. e.g., `\p{Foo}`.
    UnrecognizedUnicodeClass(String),
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}

impl Error {
    /// Returns an approximate *character* offset at which the error occurred.
    ///
    /// The character offset may be equal to the number of characters in the
    /// string, in which case it should be interpreted as pointing to the end
    /// of the regex.
    pub fn position(&self) -> usize {
        self.pos
    }

    /// Returns the type of the regex parse error.
    pub fn kind(&self) -> &ErrorKind {
        &self.kind
    }
}

impl ErrorKind {
    fn description(&self) -> &str {
        use ErrorKind::*;
        match *self {
            DoubleFlagNegation => "double flag negation",
            DuplicateCaptureName(_) => "duplicate capture name",
            EmptyAlternate => "empty alternate",
            EmptyCaptureName => "empty capture name",
            EmptyFlagNegation => "flag negation without any flags",
            EmptyGroup => "empty group (e.g., '()')",
            InvalidBase10(_) => "invalid base 10 number",
            InvalidBase16(_) => "invalid base 16 number",
            InvalidCaptureName(_) => "invalid capture name",
            InvalidClassRange{..} => "invalid character class range",
            InvalidClassEscape(_) => "invalid escape sequence in class",
            InvalidRepeatRange{..} => "invalid counted repetition range",
            InvalidScalarValue(_) => "invalid Unicode scalar value",
            MissingBase10 => "missing count in repetition operator",
            RepeaterExpectsExpr => "repetition operator missing expression",
            RepeaterUnexpectedExpr(_) => "expression cannot be repeated",
            UnclosedCaptureName(_) => "unclosed capture group name",
            UnclosedHex => "unclosed hexadecimal literal",
            UnclosedParen => "unclosed parenthesis",
            UnclosedRepeat => "unclosed counted repetition operator",
            UnclosedUnicodeName => "unclosed Unicode class literal",
            UnexpectedClassEof => "unexpected EOF in character class",
            UnexpectedEscapeEof => "unexpected EOF in escape sequence",
            UnexpectedFlagEof => "unexpected EOF in flags",
            UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal",
            UnopenedParen => "unopened parenthesis",
            UnrecognizedEscape(_) => "unrecognized escape sequence",
            UnrecognizedFlag(_) => "unrecognized flag",
            UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name",
            __Nonexhaustive => unreachable!(),
        }
    }
}

impl ::std::error::Error for Error {
    fn description(&self) -> &str {
        self.kind.description()
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Error parsing regex near '{}' at character offset {}: {}",
               self.surround, self.pos, self.kind)
    }
}

impl fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use ErrorKind::*;
        match *self {
            DoubleFlagNegation =>
                write!(f, "Only one negation symbol is allowed in flags."),
            DuplicateCaptureName(ref s) =>
                write!(f, "Capture name '{}' is used more than once.", s),
            EmptyAlternate =>
                write!(f, "Alternations cannot be empty."),
            EmptyCaptureName =>
                write!(f, "Capture names cannot be empty."),
            EmptyFlagNegation =>
                write!(f, "Flag negation requires setting at least one flag."),
            EmptyGroup =>
                write!(f, "Empty regex groups (e.g., '()') are not allowed."),
            InvalidBase10(ref s) =>
                write!(f, "Not a valid base 10 number: '{}'", s),
            InvalidBase16(ref s) =>
                write!(f, "Not a valid base 16 number: '{}'", s),
            InvalidCaptureName(ref s) =>
                write!(f, "Invalid capture name: '{}'. Capture names must \
                           consist of [_a-zA-Z0-9] and are not allowed to \
                           start with with a number.", s),
            InvalidClassRange { start, end } =>
                write!(f, "Invalid character class range '{}-{}'. \
                           Character class ranges must start with the smaller \
                           character, but {} > {}", start, end, start, end),
            InvalidClassEscape(ref e) =>
                write!(f, "Invalid escape sequence in character \
                           class: '{}'.", e),
            InvalidRepeatRange { min, max } =>
                write!(f, "Invalid counted repetition range: {{{}, {}}}. \
                           Counted repetition ranges must start with the \
                           minimum, but {} > {}", min, max, min, max),
            InvalidScalarValue(c) =>
                write!(f, "Number does not correspond to a Unicode scalar \
                           value: '{}'.", c),
            MissingBase10 =>
                write!(f, "Missing maximum in counted reptition operator."),
            RepeaterExpectsExpr =>
                write!(f, "Missing expression for reptition operator."),
            RepeaterUnexpectedExpr(ref e) =>
                write!(f, "Invalid application of reptition operator to: \
                          '{}'.", e),
            UnclosedCaptureName(ref s) =>
                write!(f, "Capture name group for '{}' is not closed. \
                           (Missing a '>'.)", s),
            UnclosedHex =>
                write!(f, "Unclosed hexadecimal literal (missing a '}}')."),
            UnclosedParen =>
                write!(f, "Unclosed parenthesis."),
            UnclosedRepeat =>
                write!(f, "Unclosed counted repetition (missing a '}}')."),
            UnclosedUnicodeName =>
                write!(f, "Unclosed Unicode literal (missing a '}}')."),
            UnexpectedClassEof =>
                write!(f, "Character class was not closed before the end of \
                           the regex (missing a ']')."),
            UnexpectedEscapeEof =>
                write!(f, "Started an escape sequence that didn't finish \
                           before the end of the regex."),
            UnexpectedFlagEof =>
                write!(f, "Inline flag settings was not closed before the end \
                           of the regex (missing a ')' or ':')."),
            UnexpectedTwoDigitHexEof =>
                write!(f, "Unexpected end of two digit hexadecimal literal."),
            UnopenedParen =>
                write!(f, "Unopened parenthesis."),
            UnrecognizedEscape(c) =>
                write!(f, "Unrecognized escape sequence: '\\{}'.", c),
            UnrecognizedFlag(c) =>
                write!(f, "Unrecognized flag: '{}'. \
                           (Allowed flags: i, s, m, U, x.)", c),
            UnrecognizedUnicodeClass(ref s) =>
                write!(f, "Unrecognized Unicode class name: '{}'.", s),
            __Nonexhaustive => unreachable!(),
        }
    }
}

/// Returns the Unicode *simple* case folding of `c`.
///
/// N.B. This is hidden because it really isn't the responsibility of this
/// crate to do simple case folding. One hopes that either another crate or
/// the standard library will be able to do this for us. In any case, we still
/// expose it because it is used inside the various Regex engines.
#[doc(hidden)]
pub fn simple_case_fold(c: char) -> char {
    simple_case_fold_result(c)
        .map(|i| case_folding::C_plus_S_table[i].1)
        .unwrap_or(c)
}

/// The result of binary search on the simple case folding table.
///
/// This level of detail is exposed so that we can do case folding on a
/// range of characters efficiently.
fn simple_case_fold_result(c: char) -> ::std::result::Result<usize, usize> {
    case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c))
}

/// Escapes all regular expression meta characters in `text`.
///
/// The string returned may be safely used as a literal in a regular
/// expression.
pub fn quote(text: &str) -> String {
    let mut quoted = String::with_capacity(text.len());
    for c in text.chars() {
        if parser::is_punct(c) {
            quoted.push('\\');
        }
        quoted.push(c);
    }
    quoted
}

fn quote_char(c: char) -> String {
    let mut s = String::new();
    if parser::is_punct(c) {
        s.push('\\');
    }
    s.push(c);
    s
}

fn inc_char(c: char) -> char {
    match c {
        char::MAX => char::MAX,
        '\u{D7FF}' => '\u{E000}',
        c => char::from_u32(c as u32 + 1).unwrap(),
    }
}

fn dec_char(c: char) -> char {
    match c {
        '\x00' => '\x00',
        '\u{E000}' => '\u{D7FF}',
        c => char::from_u32(c as u32 - 1).unwrap(),
    }
}

/// Returns true if and only if `c` is a word character.
#[doc(hidden)]
pub fn is_word_char(c: char) -> bool {
    match c {
        '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z'  => true,
        _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| {
            if c >= start && c <= end {
                Ordering::Equal
            } else if start > c {
                Ordering::Greater
            } else {
                Ordering::Less
            }
        }).is_ok(),
    }
}

#[cfg(test)]
mod properties;

#[cfg(test)]
mod tests {
    use {CharClass, ClassRange};

    fn class(ranges: &[(char, char)]) -> CharClass {
        let ranges = ranges.iter().cloned()
                           .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
        CharClass::new(ranges)
    }

    fn classi(ranges: &[(char, char)]) -> CharClass {
        let mut cls = class(ranges);
        cls.casei = true;
        cls
    }

    #[test]
    fn class_canon_no_change() {
        let cls = class(&[('a', 'c'), ('x', 'z')]);
        assert_eq!(cls.clone().canonicalize(), cls);
    }

    #[test]
    fn class_canon_unordered() {
        let cls = class(&[('x', 'z'), ('a', 'c')]);
        assert_eq!(cls.canonicalize(), class(&[
            ('a', 'c'), ('x', 'z'),
        ]));
    }

    #[test]
    fn class_canon_overlap() {
        let cls = class(&[('x', 'z'), ('w', 'y')]);
        assert_eq!(cls.canonicalize(), class(&[
            ('w', 'z'),
        ]));
    }

    #[test]
    fn class_canon_overlap_many() {
        let cls = class(&[
            ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
            ('m', 'p'), ('l', 's'),
        ]);
        assert_eq!(cls.clone().canonicalize(), class(&[
            ('a', 'j'), ('l', 's'),
        ]));
    }

    #[test]
    fn class_canon_overlap_many_case_fold() {
        let cls = class(&[
            ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
            ('M', 'P'), ('L', 'S'), ('c', 'f'),
        ]);
        assert_eq!(cls.case_fold(), classi(&[
            ('a', 'j'), ('l', 's'),
        ]));
    }

    #[test]
    fn class_canon_overlap_boundary() {
        let cls = class(&[('x', 'z'), ('u', 'w')]);
        assert_eq!(cls.canonicalize(), class(&[
            ('u', 'z'),
        ]));
    }

    #[test]
    fn class_canon_extreme_edge_case() {
        let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
        assert_eq!(cls.canonicalize(), class(&[
            ('\x00', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_canon_singles() {
        let cls = class(&[('a', 'a'), ('b', 'b')]);
        assert_eq!(cls.canonicalize(), class(&[('a', 'b')]));
    }

    #[test]
    fn class_negate_single() {
        let cls = class(&[('a', 'a')]);
        assert_eq!(cls.negate(), class(&[
            ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_negate_singles() {
        let cls = class(&[('a', 'a'), ('b', 'b')]);
        assert_eq!(cls.negate(), class(&[
            ('\x00', '\x60'), ('\x63', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_negate_multiples() {
        let cls = class(&[('a', 'c'), ('x', 'z')]);
        assert_eq!(cls.negate(), class(&[
            ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_negate_min_scalar() {
        let cls = class(&[('\x00', 'a')]);
        assert_eq!(cls.negate(), class(&[
            ('\x62', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_negate_max_scalar() {
        let cls = class(&[('a', '\u{10FFFF}')]);
        assert_eq!(cls.negate(), class(&[
            ('\x00', '\x60'),
        ]));
    }

    #[test]
    fn class_negate_everything() {
        let cls = class(&[('\x00', '\u{10FFFF}')]);
        assert_eq!(cls.negate(), class(&[]));
    }

    #[test]
    fn class_negate_everything_sans_one() {
        let cls = class(&[
            ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')
        ]);
        assert_eq!(cls.negate(), class(&[
            ('\u{10FFFE}', '\u{10FFFE}'),
        ]));
    }

    #[test]
    fn class_negate_surrogates_min() {
        let cls = class(&[('\x00', '\u{D7FF}')]);
        assert_eq!(cls.negate(), class(&[
            ('\u{E000}', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_negate_surrogates_min_edge() {
        let cls = class(&[('\x00', '\u{D7FE}')]);
        assert_eq!(cls.negate(), class(&[
            ('\u{D7FF}', '\u{10FFFF}'),
        ]));
    }

    #[test]
    fn class_negate_surrogates_max() {
        let cls = class(&[('\u{E000}', '\u{10FFFF}')]);
        assert_eq!(cls.negate(), class(&[
            ('\x00', '\u{D7FF}'),
        ]));
    }

    #[test]
    fn class_negate_surrogates_max_edge() {
        let cls = class(&[('\u{E001}', '\u{10FFFF}')]);
        assert_eq!(cls.negate(), class(&[
            ('\x00', '\u{E000}'),
        ]));
    }

    #[test]
    fn class_fold_retain_only_needed() {
        let cls = class(&[('A', 'Z'), ('a', 'z')]);
        assert_eq!(cls.case_fold(), classi(&[
            ('a', 'z'),
        ]));
    }

    #[test]
    fn class_fold_az() {
        let cls = class(&[('A', 'Z')]);
        assert_eq!(cls.case_fold(), classi(&[
            ('a', 'z'),
        ]));
    }

    #[test]
    fn class_fold_a_underscore() {
        let cls = class(&[('A', 'A'), ('_', '_')]);
        assert_eq!(cls.clone().canonicalize(), class(&[
            ('A', 'A'), ('_', '_'),
        ]));
        assert_eq!(cls.case_fold(), classi(&[
            ('_', '_'), ('a', 'a'),
        ]));
    }

    #[test]
    fn class_fold_a_equals() {
        let cls = class(&[('A', 'A'), ('=', '=')]);
        assert_eq!(cls.clone().canonicalize(), class(&[
            ('=', '='), ('A', 'A'),
        ]));
        assert_eq!(cls.case_fold(), classi(&[
            ('=', '='), ('a', 'a'),
        ]));
    }

    #[test]
    fn class_fold_no_folding_needed() {
        let cls = class(&[('\x00', '\x10')]);
        assert_eq!(cls.case_fold(), classi(&[
            ('\x00', '\x10'),
        ]));
    }
}
Keyboard shortcuts

Search tricks