/* vim: set filetype=rust.rustpeg */ // Copyright 2016 Mozilla // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use // this file except in compliance with the License. You may obtain a copy of the // License at http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. use std::collections::{BTreeSet, BTreeMap, LinkedList}; use std::iter::FromIterator; use std::f64::{NAN, INFINITY, NEG_INFINITY}; use chrono::{ DateTime, TimeZone, Utc }; use num::BigInt; use ordered_float::OrderedFloat; use uuid::Uuid; use types::{SpannedValue, Span, ValueAndSpan}; // Goal: Be able to parse https://github.com/edn-format/edn // Also extensible to help parse http://docs.datomic.com/query.html // Debugging hint: test using `cargo test --features peg/trace -- --nocapture` // to trace where the parser is failing // TODO: Support tagged elements // TODO: Support discard pub nil -> SpannedValue = "nil" { SpannedValue::Nil } pub nan -> SpannedValue = "#f" whitespace+ "NaN" { SpannedValue::Float(OrderedFloat(NAN)) } pub infinity -> SpannedValue = "#f" whitespace+ s:$(sign) "Infinity" { SpannedValue::Float(OrderedFloat(if s == "+" { INFINITY } else { NEG_INFINITY })) } pub boolean -> SpannedValue = "true" { SpannedValue::Boolean(true) } / "false" { SpannedValue::Boolean(false) } digit = [0-9] alphanumeric = [0-9a-zA-Z] octaldigit = [0-7] validbase = [3][0-6] / [12][0-9] / [2-9] hex = [0-9a-fA-F] sign = [+-] pub bigint -> SpannedValue = b:$( sign? digit+ ) "N" { SpannedValue::BigInteger(b.parse::().unwrap()) } pub octalinteger -> SpannedValue = "0" i:$( octaldigit+ ) { SpannedValue::Integer(i64::from_str_radix(i, 8).unwrap()) } pub hexinteger -> SpannedValue = "0x" i:$( hex+ ) { SpannedValue::Integer(i64::from_str_radix(i, 16).unwrap()) } pub basedinteger -> SpannedValue = b:$( validbase ) "r" i:$( alphanumeric+ ) { SpannedValue::Integer(i64::from_str_radix(i, b.parse::().unwrap()).unwrap()) } pub integer -> SpannedValue = i:$( sign? digit+ ) !("." / ([eE])) { SpannedValue::Integer(i.parse::().unwrap()) } pub float -> SpannedValue = f:$(sign? digit+ ("." digit+)? ([eE] sign? digit+)?) { SpannedValue::Float(OrderedFloat(f.parse::().unwrap())) } number -> SpannedValue = ( bigint / basedinteger / hexinteger / octalinteger / integer / float ) // TODO: standalone characters: \, \newline, \return, \space and \tab. string_special_char -> &'input str = "\\" $([\\"ntr]) string_normal_chars -> &'input str = $([^"\\]+) // This is what we need to do in order to unescape. We can't just match the entire string slice: // we get a Vec<&str> from rust-peg, where some of the parts might be unescaped special characters, // and we join it together to form an output string. // E.g., input = r#"\"foo\\\\bar\""# // output = [quote, "foo", backslash, "bar", quote] // result = r#""foo\\bar""# // For the typical case, string_normal_chars will match multiple, leading to a single-element vec. pub text -> SpannedValue = "\"" t:((string_special_char / string_normal_chars)*) "\"" { SpannedValue::Text(t.join(&"").to_string()) } // RFC 3339 timestamps. #inst "1985-04-12T23:20:50.52Z" // We accept an arbitrary depth of decimals. // Note that we discard the timezone information -- all times are translated to UTC. inst_string -> DateTime = "#inst" whitespace+ "\"" d:$( [0-9]*<4> "-" [0-2][0-9] "-" [0-3][0-9] "T" [0-2][0-9] ":" [0-5][0-9] ":" [0-6][0-9] ("." [0-9]+)? "Z" / (("+" / "-") [0-2][0-9] ":" [0-5][0-9]) ) "\"" {? DateTime::parse_from_rfc3339(d) .map(|t| t.with_timezone(&Utc)) .map_err(|_| "invalid datetime") // Oh, rustpeg. } inst_micros -> DateTime = "#instmicros" whitespace+ d:$( digit+ ) { let micros = d.parse::().unwrap(); let seconds: i64 = micros / 1000000; let nanos: u32 = ((micros % 1000000).abs() as u32) * 1000; Utc.timestamp(seconds, nanos) } inst_millis -> DateTime = "#instmillis" whitespace+ d:$( digit+ ) { let millis = d.parse::().unwrap(); let seconds: i64 = millis / 1000; let nanos: u32 = ((millis % 1000).abs() as u32) * 1000000; Utc.timestamp(seconds, nanos) } inst -> SpannedValue = t:(inst_millis / inst_micros / inst_string) { SpannedValue::Instant(t) } uuid_string -> Uuid = "\"" u:$( [a-f0-9]*<8> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<12> ) "\"" { Uuid::parse_str(u).expect("this is a valid UUID string") } pub uuid -> SpannedValue = "#uuid" whitespace+ u:uuid_string { SpannedValue::Uuid(u) } namespace_divider = "." namespace_separator = "/" // TODO: Be more picky here // Keywords follow the rules of symbols, except they can (and must) begin with : // e.g. :fred or :my/fred. See https://github.com/edn-format/edn#keywords symbol_char_initial = [a-zA-Z0-9*!_?$%&=<>] symbol_char_subsequent = [a-zA-Z0-9*!_?$%&=<>-] symbol_namespace = symbol_char_initial symbol_char_subsequent* (namespace_divider symbol_char_subsequent+)* symbol_name = ( symbol_char_initial+ symbol_char_subsequent* ) plain_symbol_name = symbol_name / "..." / "." keyword_prefix = ":" pub symbol -> SpannedValue = ns:( sns:$(symbol_namespace) namespace_separator { sns })? n:$(plain_symbol_name) { SpannedValue::from_symbol(ns, n) } pub keyword -> SpannedValue = keyword_prefix ns:( sns:$(symbol_namespace) namespace_separator { sns })? n:$(symbol_name) { SpannedValue::from_keyword(ns, n) } pub list -> SpannedValue = "(" __ v:(value)* __ ")" { SpannedValue::List(LinkedList::from_iter(v)) } pub vector -> SpannedValue = "[" __ v:(value)* __ "]" { SpannedValue::Vector(v) } pub set -> SpannedValue = "#{" __ v:(value)* __ "}" { SpannedValue::Set(BTreeSet::from_iter(v)) } pair -> (ValueAndSpan, ValueAndSpan) = k:(value) v:(value) { (k, v) } pub map -> SpannedValue = "{" __ v:(pair)* __ "}" { SpannedValue::Map(BTreeMap::from_iter(v)) } // It's important that float comes before integer or the parser assumes that // floats are integers and fails to parse pub value -> ValueAndSpan = __ start:#position v:(nil / nan / infinity / boolean / number / inst / uuid / text / keyword / symbol / list / vector / map / set) end:#position __ { ValueAndSpan { inner: v, span: Span::new(start, end) } } // Clojure (and thus EDN) regards commas as whitespace, and thus the two-element vectors [1 2] and // [1,,,,2] are equivalent, as are the maps {:a 1, :b 2} and {:a 1 :b 2}. whitespace = [ \r\n\t,] comment = ";" [^\r\n]* [\r\n]? __ = (whitespace / comment)*