Speed up EDN parser (fixes #445) (#581) r=nalexander

Fixes from @kevinmehall.

* Prefer character sets over backtracking in the EDN parser.
* Avoid duplicate effort when parsing floats in the EDN parser.
* Clean up duplicate position tracking code.

The position-tracking cleanup turns out to have little performance impact,
but it makes the grammar much cleaner.

* Fix EDN work to pass tests with correct numeric precedence.
This commit is contained in:
Richard Newman 2018-03-05 20:33:51 -08:00 committed by GitHub
parent 30bf827d16
commit 9b23cf3945
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 182 deletions

View file

@ -34,105 +34,40 @@ use types::{SpannedValue, Span, ValueAndSpan};
// TODO: Support tagged elements
// TODO: Support discard
pub nil -> ValueAndSpan =
start:#position "nil" end:#position {
ValueAndSpan {
inner: SpannedValue::Nil,
span: Span::new(start, end)
}
}
pub nil -> SpannedValue = "nil" { SpannedValue::Nil }
pub nan -> SpannedValue = "#f" whitespace+ "NaN" { SpannedValue::Float(OrderedFloat(NAN)) }
pub nan -> ValueAndSpan =
start:#position "#f" whitespace+ "NaN" end:#position {
ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(NAN)),
span: Span::new(start, end)
}
}
pub infinity -> SpannedValue = "#f" whitespace+ s:$(sign) "Infinity"
{ SpannedValue::Float(OrderedFloat(if s == "+" { INFINITY } else { NEG_INFINITY })) }
pub infinity -> ValueAndSpan =
start:#position "#f" whitespace+ s:$(sign) "Infinity" end:#position {
ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(if s == "+" { INFINITY } else { NEG_INFINITY })),
span: Span::new(start, end)
}
}
pub boolean -> ValueAndSpan =
start:#position "true" end:#position {
ValueAndSpan {
inner: SpannedValue::Boolean(true),
span: Span::new(start, end)
}
} /
start:#position "false" end:#position {
ValueAndSpan {
inner: SpannedValue::Boolean(false),
span: Span::new(start, end)
}
}
pub boolean -> SpannedValue
= "true" { SpannedValue::Boolean(true) }
/ "false" { SpannedValue::Boolean(false) }
digit = [0-9]
alphanumeric = [0-9a-zA-Z]
octaldigit = [0-7]
validbase = [3][0-6] / [12][0-9] / [2-9]
hex = [0-9a-fA-F]
sign = "-" / "+"
sign = [+-]
pub bigint -> ValueAndSpan =
start:#position b:$( sign? digit+ ) "N" end:#position {
ValueAndSpan {
inner: SpannedValue::BigInteger(b.parse::<BigInt>().unwrap()),
span: Span::new(start, end)
}
}
pub bigint -> SpannedValue = b:$( sign? digit+ ) "N"
{ SpannedValue::BigInteger(b.parse::<BigInt>().unwrap()) }
pub octalinteger -> SpannedValue = "0" i:$( octaldigit+ )
{ SpannedValue::Integer(i64::from_str_radix(i, 8).unwrap()) }
pub hexinteger -> SpannedValue = "0x" i:$( hex+ )
{ SpannedValue::Integer(i64::from_str_radix(i, 16).unwrap()) }
pub octalinteger -> ValueAndSpan =
start:#position "0" i:$( octaldigit+ ) end:#position {
ValueAndSpan {
inner: SpannedValue::Integer(i64::from_str_radix(i, 8).unwrap()),
span: Span::new(start, end)
}
}
pub basedinteger -> SpannedValue = b:$( validbase ) "r" i:$( alphanumeric+ )
{ SpannedValue::Integer(i64::from_str_radix(i, b.parse::<u32>().unwrap()).unwrap()) }
pub hexinteger -> ValueAndSpan =
start:#position "0x" i:$( hex+ ) end:#position {
ValueAndSpan {
inner: SpannedValue::Integer(i64::from_str_radix(i, 16).unwrap()),
span: Span::new(start, end)
}
}
pub integer -> SpannedValue = i:$( sign? digit+ ) !("." / ([eE]))
{ SpannedValue::Integer(i.parse::<i64>().unwrap()) }
pub basedinteger -> ValueAndSpan =
// Only allow values 2-36
start:#position b:$( validbase ) "r" i:$( alphanumeric+ ) end:#position {
ValueAndSpan {
inner: SpannedValue::Integer(i64::from_str_radix(i, b.parse::<u32>().unwrap()).unwrap()),
span: Span::new(start, end)
}
}
pub float -> SpannedValue = f:$(sign? digit+ ("." digit+)? ([eE] sign? digit+)?)
{ SpannedValue::Float(OrderedFloat(f.parse::<f64>().unwrap())) }
pub integer -> ValueAndSpan =
start:#position i:$( sign? digit+ ) end:#position {
ValueAndSpan {
inner: SpannedValue::Integer(i.parse::<i64>().unwrap()),
span: Span::new(start, end)
}
}
frac = sign? digit+ "." digit+
exp = sign? digit+ ("e" / "E") sign? digit+
frac_exp = sign? digit+ "." digit+ ("e" / "E") sign? digit+
// The order here is important - frac_exp must come before (exp / frac) or the
// parser assumes exp or frac when the float is really a frac_exp and fails
pub float -> ValueAndSpan =
start:#position f:$( frac_exp / exp / frac ) end:#position {
ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(f.parse::<f64>().unwrap())),
span: Span::new(start, end)
}
}
number -> SpannedValue = ( bigint / basedinteger / hexinteger / octalinteger / integer / float )
// TODO: \newline, \return, \space and \tab
special_char = quote / tab
@ -140,19 +75,13 @@ quote = "\\\""
tab = "\\tab"
char = [^"] / special_char
pub text -> ValueAndSpan =
start:#position "\"" t:$( char* ) "\"" end:#position {
ValueAndSpan {
inner: SpannedValue::Text(t.to_string()),
span: Span::new(start, end)
}
}
pub text -> SpannedValue = "\"" t:$( char* ) "\""
{ SpannedValue::Text(t.to_string()) }
// RFC 3339 timestamps. #inst "1985-04-12T23:20:50.52Z"
// We accept an arbitrary depth of decimals.
// Note that we discard the timezone information -- all times are translated to UTC.
pub inst_string -> DateTime<Utc> =
inst_string -> DateTime<Utc> =
"#inst" whitespace+ "\"" d:$( [0-9]*<4> "-" [0-2][0-9] "-" [0-3][0-9]
"T"
[0-2][0-9] ":" [0-5][0-9] ":" [0-6][0-9]
@ -165,7 +94,7 @@ pub inst_string -> DateTime<Utc> =
.map_err(|_| "invalid datetime") // Oh, rustpeg.
}
pub inst_micros -> DateTime<Utc> =
inst_micros -> DateTime<Utc> =
"#instmicros" whitespace+ d:$( digit+ ) {
let micros = d.parse::<i64>().unwrap();
let seconds: i64 = micros / 1000000;
@ -173,7 +102,7 @@ pub inst_micros -> DateTime<Utc> =
Utc.timestamp(seconds, nanos)
}
pub inst_millis -> DateTime<Utc> =
inst_millis -> DateTime<Utc> =
"#instmillis" whitespace+ d:$( digit+ ) {
let millis = d.parse::<i64>().unwrap();
let seconds: i64 = millis / 1000;
@ -181,26 +110,16 @@ pub inst_millis -> DateTime<Utc> =
Utc.timestamp(seconds, nanos)
}
pub inst -> ValueAndSpan =
start:#position t:(inst_millis / inst_micros / inst_string) end:#position {
ValueAndSpan {
inner: SpannedValue::Instant(t),
span: Span::new(start, end)
}
}
inst -> SpannedValue = t:(inst_millis / inst_micros / inst_string)
{ SpannedValue::Instant(t) }
pub uuid_string -> Uuid =
uuid_string -> Uuid =
"\"" u:$( [a-f0-9]*<8> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<12> ) "\"" {
Uuid::parse_str(u).expect("this is a valid UUID string")
}
pub uuid -> ValueAndSpan =
start:#position "#uuid" whitespace+ u:(uuid_string) end:#position {
ValueAndSpan {
inner: SpannedValue::Uuid(u),
span: Span::new(start, end)
}
}
pub uuid -> SpannedValue = "#uuid" whitespace+ u:uuid_string
{ SpannedValue::Uuid(u) }
namespace_divider = "."
namespace_separator = "/"
@ -208,8 +127,8 @@ namespace_separator = "/"
// TODO: Be more picky here
// Keywords follow the rules of symbols, except they can (and must) begin with :
// e.g. :fred or :my/fred. See https://github.com/edn-format/edn#keywords
symbol_char_initial = [a-z] / [A-Z] / [0-9] / [*!_?$%&=<>]
symbol_char_subsequent = [a-z] / [A-Z] / [0-9] / [-*!_?$%&=<>]
symbol_char_initial = [a-zA-Z0-9*!_?$%&=<>]
symbol_char_subsequent = [a-zA-Z0-9*!_?$%&=<>-]
symbol_namespace = symbol_char_initial symbol_char_subsequent* (namespace_divider symbol_char_subsequent+)*
symbol_name = ( symbol_char_initial+ symbol_char_subsequent* )
@ -217,76 +136,47 @@ plain_symbol_name = symbol_name / "..." / "."
keyword_prefix = ":"
pub symbol -> ValueAndSpan =
start:#position
pub symbol -> SpannedValue =
ns:( sns:$(symbol_namespace) namespace_separator { sns })?
n:$(plain_symbol_name)
end:#position {
ValueAndSpan {
inner: SpannedValue::from_symbol(ns, n),
span: Span::new(start, end)
}
}
{ SpannedValue::from_symbol(ns, n) }
pub keyword -> ValueAndSpan =
start:#position
pub keyword -> SpannedValue =
keyword_prefix
ns:( sns:$(symbol_namespace) namespace_separator { sns })?
n:$(symbol_name)
end:#position {
ValueAndSpan {
inner: SpannedValue::from_keyword(ns, n),
span: Span::new(start, end)
}
}
{ SpannedValue::from_keyword(ns, n) }
pub list -> ValueAndSpan =
start:#position "(" __ v:(value)* __ ")" end:#position {
ValueAndSpan {
inner: SpannedValue::List(LinkedList::from_iter(v)),
span: Span::new(start, end)
}
}
pub list -> SpannedValue = "(" __ v:(value)* __ ")"
{ SpannedValue::List(LinkedList::from_iter(v)) }
pub vector -> ValueAndSpan =
start:#position "[" __ v:(value)* __ "]" end:#position {
ValueAndSpan {
inner: SpannedValue::Vector(v),
span: Span::new(start, end)
}
}
pub vector -> SpannedValue = "[" __ v:(value)* __ "]"
{ SpannedValue::Vector(v) }
pub set -> ValueAndSpan =
start:#position "#{" __ v:(value)* __ "}" end:#position {
ValueAndSpan {
inner: SpannedValue::Set(BTreeSet::from_iter(v)),
span: Span::new(start, end)
}
}
pub set -> SpannedValue = "#{" __ v:(value)* __ "}"
{ SpannedValue::Set(BTreeSet::from_iter(v)) }
pair -> (ValueAndSpan, ValueAndSpan) =
k:(value) v:(value) {
(k, v)
}
pub map -> ValueAndSpan =
start:#position "{" __ v:(pair)* __ "}" end:#position {
ValueAndSpan {
inner: SpannedValue::Map(BTreeMap::from_iter(v)),
span: Span::new(start, end)
}
}
pub map -> SpannedValue = "{" __ v:(pair)* __ "}"
{ SpannedValue::Map(BTreeMap::from_iter(v)) }
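// Numeric alternatives are ordered inside `number`: `integer` is tried before
// `float`, and `integer` refuses to match when the digits are followed by "."
// or an exponent marker, so floats are not mis-parsed as integers.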
pub value -> ValueAndSpan =
__ v:(nil / nan / infinity / boolean / float / octalinteger / hexinteger / basedinteger / inst / uuid / bigint / integer / text / keyword / symbol / list / vector / map / set) __ {
v
__ start:#position v:(nil / nan / infinity / boolean / number / inst / uuid / text / keyword / symbol / list / vector / map / set) end:#position __ {
ValueAndSpan {
inner: v,
span: Span::new(start, end)
}
}
// Clojure (and thus EDN) regards commas as whitespace, and thus the two-element vectors [1 2] and
// [1,,,,2] are equivalent, as are the maps {:a 1, :b 2} and {:a 1 :b 2}.
whitespace = (" " / "\r" / "\n" / "\t" / ",")
comment = ";" [^\r\n]* ("\r" / "\n")?
whitespace = [ \r\n\t,]
comment = ";" [^\r\n]* [\r\n]?
__ = (whitespace / comment)*

View file

@ -163,6 +163,12 @@ impl From<SpannedValue> for Value {
}
}
impl From<ValueAndSpan> for Value {
fn from(src: ValueAndSpan) -> Value {
src.inner.into()
}
}
/// Creates `from_$TYPE` helper functions for Value and SpannedValue,
/// like `from_float()` or `from_ordered_float()`.
macro_rules! def_from {
@ -617,7 +623,10 @@ mod test {
#[test]
fn test_print_edn() {
assert_eq!("1234N", Value::from_bigint("1234").unwrap().to_string());
let string = "[ 1 2 ( 3.14 ) #{ 4N } { foo/bar 42 :baz/boz 43 } [ ] :five :six/seven eight nine/ten true false nil #f NaN #f -Infinity #f +Infinity ]";
let data = Value::Vector(vec![
Value::Integer(1),
Value::Integer(2),

View file

@ -60,7 +60,7 @@ fn s_plain(name: &str) -> Value {
macro_rules! fn_parse_into_value {
($name: ident) => {
fn $name<'a, T>(src: T) -> Result<Value, ParseError> where T: Into<&'a str> {
parse::$name(src.into()).map(|x| x.inner.into())
parse::$name(src.into()).map(|x| x.into())
}
}
}
@ -98,7 +98,7 @@ fn test_nil() {
#[test]
fn test_span_nil() {
assert_eq!(parse::nil("nil").unwrap(), ValueAndSpan {
assert_eq!(parse::value("nil").unwrap(), ValueAndSpan {
inner: SpannedValue::Nil,
span: Span(0, 3)
});
@ -120,7 +120,7 @@ fn test_nan() {
#[test]
fn test_span_nan() {
assert_eq!(parse::nan("#f NaN").unwrap(), ValueAndSpan {
assert_eq!(parse::value("#f NaN").unwrap(), ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(f64::NAN)),
span: Span(0, 6)
});
@ -150,11 +150,11 @@ fn test_infinity() {
#[test]
fn test_span_infinity() {
assert_eq!(parse::infinity("#f -Infinity").unwrap(), ValueAndSpan {
assert_eq!(parse::value("#f -Infinity").unwrap(), ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(f64::NEG_INFINITY)),
span: Span(0, 12)
});
assert_eq!(parse::infinity("#f +Infinity").unwrap(), ValueAndSpan {
assert_eq!(parse::value("#f +Infinity").unwrap(), ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(f64::INFINITY)),
span: Span(0, 12)
});
@ -172,12 +172,12 @@ fn test_boolean() {
#[test]
fn test_span_boolean() {
assert_eq!(parse::boolean("true").unwrap(), ValueAndSpan {
assert_eq!(parse::value("true").unwrap(), ValueAndSpan {
inner: SpannedValue::Boolean(true),
span: Span(0, 4)
});
assert_eq!(parse::boolean("false").unwrap(), ValueAndSpan {
assert_eq!(parse::value("false").unwrap(), ValueAndSpan {
inner: SpannedValue::Boolean(false),
span: Span(0, 5)
});
@ -235,19 +235,19 @@ fn test_octalinteger() {
#[test]
fn test_span_integer() {
assert_eq!(parse::integer("42").unwrap(), ValueAndSpan {
assert_eq!(parse::value("42").unwrap(), ValueAndSpan {
inner: SpannedValue::Integer(42),
span: Span(0, 2)
});
assert_eq!(parse::hexinteger("0xabc111").unwrap(), ValueAndSpan {
assert_eq!(parse::value("0xabc111").unwrap(), ValueAndSpan {
inner: SpannedValue::Integer(11256081),
span: Span(0, 8)
});
assert_eq!(parse::basedinteger("2r111").unwrap(), ValueAndSpan {
assert_eq!(parse::value("2r111").unwrap(), ValueAndSpan {
inner: SpannedValue::Integer(7),
span: Span(0, 5)
});
assert_eq!(parse::octalinteger("011").unwrap(), ValueAndSpan {
assert_eq!(parse::value("011").unwrap(), ValueAndSpan {
inner: SpannedValue::Integer(9),
span: Span(0, 3)
});
@ -266,7 +266,6 @@ fn test_uuid() {
.expect("valid UUID");
let actual = parse::uuid("#uuid \"550e8400-e29b-41d4-a716-446655440000\"")
.expect("parse success")
.inner
.into();
assert_eq!(self::Value::Uuid(expected), actual);
}
@ -291,7 +290,7 @@ fn test_span_bigint() {
let max_i64 = i64::max_value().to_bigint().unwrap();
let bigger = &max_i64 * &max_i64;
assert_eq!(parse::bigint("85070591730234615847396907784232501249N").unwrap(), ValueAndSpan {
assert_eq!(parse::value("85070591730234615847396907784232501249N").unwrap(), ValueAndSpan {
inner: SpannedValue::BigInteger(bigger),
span: Span(0, 39)
});
@ -307,13 +306,13 @@ fn test_float() {
assert_eq!(float("77.88e99").unwrap(), Float(OrderedFloat(77.88e99f64)));
assert_eq!(float("-9.9E-9").unwrap(), Float(OrderedFloat(-9.9E-9f64)));
assert!(float("42").is_err());
assert_eq!(float("42").unwrap(), Float(OrderedFloat(42f64)));
assert!(float("nil").is_err());
}
#[test]
fn test_span_float() {
assert_eq!(parse::float("42.0").unwrap(), ValueAndSpan {
assert_eq!(parse::value("42.0").unwrap(), ValueAndSpan {
inner: SpannedValue::Float(OrderedFloat(42f64)),
span: Span(0, 4)
});
@ -332,7 +331,7 @@ fn test_text() {
#[test]
fn test_span_text() {
assert_eq!(parse::text("\"hello world\"").unwrap(), ValueAndSpan {
assert_eq!(parse::value("\"hello world\"").unwrap(), ValueAndSpan {
inner: SpannedValue::Text("hello world".to_string()),
span: Span(0, 13)
});
@ -359,11 +358,11 @@ fn test_symbol() {
#[test]
fn test_span_symbol() {
assert_eq!(parse::symbol("hello").unwrap(), ValueAndSpan {
assert_eq!(parse::value("hello").unwrap(), ValueAndSpan {
inner: SpannedValue::from_symbol(None, "hello"),
span: Span(0, 5)
});
assert_eq!(parse::symbol("hello/world").unwrap(), ValueAndSpan {
assert_eq!(parse::value("hello/world").unwrap(), ValueAndSpan {
inner: SpannedValue::from_symbol("hello", "world"),
span: Span(0, 11)
});
@ -389,11 +388,11 @@ fn test_keyword() {
#[test]
fn test_span_keyword() {
assert_eq!(parse::keyword(":hello").unwrap(), ValueAndSpan {
assert_eq!(parse::value(":hello").unwrap(), ValueAndSpan {
inner: SpannedValue::from_keyword(None, "hello"),
span: Span(0, 6)
});
assert_eq!(parse::keyword(":hello/world").unwrap(), ValueAndSpan {
assert_eq!(parse::value(":hello/world").unwrap(), ValueAndSpan {
inner: SpannedValue::from_keyword("hello", "world"),
span: Span(0, 12)
});