mentat/edn/src/edn.rustpeg

/* vim: set filetype=rust.rustpeg */

// Copyright 2016 Mozilla
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use
// this file except in compliance with the License. You may obtain a copy of the
// License at http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

use std::collections::{BTreeSet, BTreeMap, LinkedList};
use std::iter::FromIterator;
use std::f64::{NAN, INFINITY, NEG_INFINITY};

use chrono::{
    DateTime,
    TimeZone,
    Utc
};
use num::BigInt;
use ordered_float::OrderedFloat;
use uuid::Uuid;

use entities::*;
use symbols::*;
use types::{SpannedValue, Span, ValueAndSpan};

// Goal: Be able to parse https://github.com/edn-format/edn
// Also extensible to help parse http://docs.datomic.com/query.html

// Debugging hint: test using `cargo test --features peg/trace -- --nocapture`
// to trace where the parser is failing

// TODO: Support tagged elements
// TODO: Support discard

// The literal `nil`.
pub nil -> SpannedValue = "nil" { SpannedValue::Nil }
// Not-a-number, written as the tag `#f`, at least one whitespace character, then `NaN`.
pub nan -> SpannedValue = "#f" whitespace+ "NaN" { SpannedValue::Float(OrderedFloat(NAN)) }

// Signed infinity, written as `#f +Infinity` or `#f -Infinity`. The sign is mandatory;
// `+` selects positive infinity and anything else `sign` accepts selects negative infinity.
pub infinity -> SpannedValue = "#f" whitespace+ s:$(sign) "Infinity" {
    let magnitude = match s {
        "+" => INFINITY,
        _ => NEG_INFINITY,
    };
    SpannedValue::Float(OrderedFloat(magnitude))
}

// The boolean literals `true` and `false`.
pub boolean -> SpannedValue
    = "true"  { SpannedValue::Boolean(true) }
    / "false" { SpannedValue::Boolean(false) }

// Character classes shared by the numeric rules below.
digit = [0-9]
alphanumeric = [0-9a-zA-Z]
octaldigit = [0-7]
// A radix prefix for `<base>r<digits>` literals: 30-36, then 10-29, then 2-9.
// Two-digit alternatives are listed first so that e.g. "16" is not consumed as just "1".
validbase = [3][0-6] / [12][0-9] / [2-9]
hex = [0-9a-fA-F]
sign = [+-]

// Arbitrary-precision integer: an optionally signed run of decimal digits followed by `N`.
// Only the sign and digits are captured (the `N` suffix is not), so the `BigInt` parse is
// expected to always succeed on the captured text.
pub raw_bigint -> BigInt = b:$( sign? digit+ ) "N"
    { b.parse::<BigInt>().unwrap() }
// Octal integer literal: a leading `0` followed by one or more octal digits, e.g. `017`.
// The digits come straight from the input and may not fit in an i64, so an out-of-range
// literal is reported as a failed match (via the fallible `{? }` action) rather than a panic.
pub raw_octalinteger -> i64 = "0" i:$( octaldigit+ )
    {? i64::from_str_radix(i, 8).map_err(|_| "octal integer out of range") }
// Hexadecimal integer literal: `0x` followed by one or more hex digits, e.g. `0xFF`.
// The digits come straight from the input and may not fit in an i64, so an out-of-range
// literal is reported as a failed match (via the fallible `{? }` action) rather than a panic.
pub raw_hexinteger -> i64 = "0x" i:$( hex+ )
    {? i64::from_str_radix(i, 16).map_err(|_| "hexadecimal integer out of range") }
// Integer in an explicit radix: `<base>r<digits>`, e.g. `2r1010` or `36rZZ`.
// `validbase` restricts the radix to 2..=36, which is the range `from_str_radix` accepts.
// `alphanumeric+` is deliberately loose, so the digits may be invalid for the chosen radix
// (e.g. `2r9`) or overflow an i64; both cases are reported as a failed match (via the
// fallible `{? }` action) rather than a panic.
pub raw_basedinteger -> i64 = b:$( validbase ) "r" i:$( alphanumeric+ )
    {?
        b.parse::<u32>()
            .map_err(|_| "invalid radix")
            .and_then(|radix| {
                i64::from_str_radix(i, radix).map_err(|_| "invalid digits for radix")
            })
    }
// Plain decimal integer with an optional sign. The negative lookahead refuses to match
// when the digits are followed by `.` or an exponent marker, so the integer part of a
// float is never taken as an integer.
// A literal that does not fit in an i64 is reported as a failed match (via the fallible
// `{? }` action) rather than a panic. In an ordered choice such as `number`, the
// alternatives listed after this rule are then tried on the same input.
pub raw_integer -> i64 = i:$( sign? digit+ ) !("." / ([eE]))
    {? i.parse::<i64>().map_err(|_| "integer out of range") }
// Floating point literal: optional sign, integer digits, optional `.digits` fraction and
// optional exponent (`e`/`E`, optional sign, digits). Both the fraction and the exponent
// are optional, so this pattern also matches a bare run of digits.
pub raw_float -> OrderedFloat<f64> = f:$(sign? digit+ ("." digit+)? ([eE] sign? digit+)?)
    { OrderedFloat(f.parse::<f64>().unwrap()) }

// Thin wrappers lifting each raw numeric rule into the matching `SpannedValue` variant.
// All fixed-width integer notations (octal, hex, based, decimal) produce `Integer`.
pub bigint -> SpannedValue = v:raw_bigint { SpannedValue::BigInteger(v) }
pub octalinteger -> SpannedValue = v:raw_octalinteger { SpannedValue::Integer(v) }
pub hexinteger -> SpannedValue = v:raw_hexinteger { SpannedValue::Integer(v) }
pub basedinteger -> SpannedValue = v:raw_basedinteger { SpannedValue::Integer(v) }
pub integer -> SpannedValue = v:raw_integer { SpannedValue::Integer(v) }
pub float -> SpannedValue = v:raw_float { SpannedValue::Float(v) }

// Any numeric literal. This is an ordered choice and the order matters: the suffixed and
// prefixed notations (`123N`, `2r101`, `0xFF`, `017`) are tried before `integer`, which
// would otherwise match just their leading digits. `float` is tried last.
number -> SpannedValue = ( bigint / basedinteger / hexinteger / octalinteger / integer / float )

// TODO: standalone characters: \<char>, \newline, \return, \space and \tab.

// A backslash escape inside a string, yielding the text it stands for.
// `\\` and `\"` stand for the character after the backslash, while `\n`, `\t` and `\r`
// stand for newline, tab and carriage return respectively -- not the letters themselves.
string_special_char -> &'input str = "\\" c:$([\\"ntr]) {
    match c {
        "n" => "\n",
        "t" => "\t",
        "r" => "\r",
        // Backslash and double quote map to themselves.
        literal => literal,
    }
}
// A maximal run of string characters that need no unescaping: anything except a double
// quote or a backslash.
string_normal_chars -> &'input str = $([^"\\]+)

// This is what we need to do in order to unescape. We can't just match the entire string slice:
// we get a Vec<&str> from rust-peg, where some of the parts might be unescaped special characters,
// and we join it together to form an output string.
// E.g., input = r#"\"foo\\\\bar\""#
//      output = [quote, "foo", backslash, "bar", quote]
//      result = r#""foo\\bar""#
// For the typical case, string_normal_chars will match multiple, leading to a single-element vec.
// A double-quoted string. The body is matched as a sequence of escape sequences and plain
// runs, and the resulting slices are concatenated into one owned String.
pub raw_text -> String = "\"" pieces:((string_special_char / string_normal_chars)*) "\""
    { pieces.concat() }

// A string literal lifted into `SpannedValue::Text`.
pub text -> SpannedValue
    = v:raw_text { SpannedValue::Text(v) }

// RFC 3339 timestamps. #inst "1985-04-12T23:20:50.52Z"
// We accept an arbitrary depth of decimals.
// Note that we discard the timezone information -- all times are translated to UTC.
// `#inst "<timestamp>"`: the grammar only checks the overall shape of the quoted timestamp
// (digit counts, separators, `Z` or a +/-hh:mm offset). The captured text is then handed
// to `DateTime::parse_from_rfc3339`; if that rejects it, the rule fails with
// "invalid datetime" instead of producing a value.
inst_string -> DateTime<Utc> =
    "#inst" whitespace+ "\"" d:$( [0-9]*<4> "-" [0-2][0-9] "-" [0-3][0-9]
              "T"
              [0-2][0-9] ":" [0-5][0-9] ":" [0-6][0-9]
              ("." [0-9]+)?
              ("Z" / (("+" / "-") [0-2][0-9] ":" [0-5][0-9]))
            )
    "\"" {?
        // `{? ... }` is a fallible action: returning Err makes the rule fail to match.
        DateTime::parse_from_rfc3339(d)
            .map(|t| t.with_timezone(&Utc))
            .map_err(|_| "invalid datetime")        // Oh, rustpeg.
    }

// `#instmicros <digits>`: an instant given as microseconds since the Unix epoch.
// The digit run is unbounded, so it may overflow an i64 or fall outside the range chrono
// can represent; both cases fail the match (via the fallible `{? }` action) instead of
// panicking. The pattern has no sign, so the parsed value is never negative.
inst_micros -> DateTime<Utc> =
    "#instmicros" whitespace+ d:$( digit+ ) {?
        d.parse::<i64>()
            .ok()
            .and_then(|micros| {
                let seconds: i64 = micros / 1_000_000;
                // Sub-second remainder, converted from microseconds to nanoseconds.
                let nanos: u32 = ((micros % 1_000_000) as u32) * 1_000;
                Utc.timestamp_opt(seconds, nanos).single()
            })
            .ok_or("invalid #instmicros timestamp")
    }

// `#instmillis <digits>`: an instant given as milliseconds since the Unix epoch.
// The digit run is unbounded, so it may overflow an i64 or fall outside the range chrono
// can represent; both cases fail the match (via the fallible `{? }` action) instead of
// panicking. The pattern has no sign, so the parsed value is never negative.
inst_millis -> DateTime<Utc> =
    "#instmillis" whitespace+ d:$( digit+ ) {?
        d.parse::<i64>()
            .ok()
            .and_then(|millis| {
                let seconds: i64 = millis / 1_000;
                // Sub-second remainder, converted from milliseconds to nanoseconds.
                let nanos: u32 = ((millis % 1_000) as u32) * 1_000_000;
                Utc.timestamp_opt(seconds, nanos).single()
            })
            .ok_or("invalid #instmillis timestamp")
    }

// Any of the three instant notations (`#instmillis`, `#instmicros`, `#inst`), lifted into
// `SpannedValue::Instant`.
inst -> SpannedValue = t:(inst_millis / inst_micros / inst_string)
    { SpannedValue::Instant(t) }

// A double-quoted UUID in the hyphenated 8-4-4-4-12 form. Only lowercase hex digits are
// accepted by the pattern. Because the pattern fixes the shape of the captured text,
// `Uuid::parse_str` is expected to succeed on it.
uuid_string -> Uuid =
    "\"" u:$( [a-f0-9]*<8> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<12> ) "\"" {
        Uuid::parse_str(u).expect("this is a valid UUID string")
    }

// `#uuid "<uuid>"`: the tag, at least one whitespace character, then a quoted UUID.
pub uuid -> SpannedValue = "#uuid" whitespace+ u:uuid_string
    { SpannedValue::Uuid(u) }

// `.` joins the segments inside a namespace; `/` separates the namespace from the name.
namespace_divider = "."
namespace_separator = "/"

// TODO: Be more picky here
// Keywords follow the rules of symbols, except they can (and must) begin with :
// e.g. :fred or :my/fred. See https://github.com/edn-format/edn#keywords
// Characters allowed in symbols and keywords. The only difference between the two classes
// is that `-` is permitted in subsequent positions but not in the initial one.
symbol_char_initial = [a-zA-Z0-9*!_?$%&=<>]
symbol_char_subsequent = [a-zA-Z0-9*!_?$%&=<>-]

// A namespace: one initial character, any number of subsequent characters, then zero or
// more `.`-separated segments of one or more subsequent characters each.
symbol_namespace = symbol_char_initial symbol_char_subsequent* (namespace_divider symbol_char_subsequent+)*
// A name: one or more initial characters followed by any number of subsequent characters.
symbol_name = ( symbol_char_initial+ symbol_char_subsequent* )
// Names usable for symbols: a regular name, or the special names `...` and `.`.
// `...` is listed before `.` so that the longer literal gets the first chance to match.
plain_symbol_name = symbol_name / "..." / "."

// The leading `:` that marks a keyword.
keyword_prefix = ":"

// A symbol: an optional `namespace/` prefix followed by a name. Only the namespace text
// (without the `/`) is captured in `ns`; both parts are passed to
// `SpannedValue::from_symbol`.
pub symbol -> SpannedValue =
    ns:( sns:$(symbol_namespace) namespace_separator { sns })?
    n:$(plain_symbol_name)
    { SpannedValue::from_symbol(ns, n) }

// A keyword: `:` followed by an optional `namespace/` prefix and a name. Unlike `symbol`,
// the name must be a regular `symbol_name`; the special names `...` and `.` are not
// accepted here.
pub keyword -> SpannedValue =
    keyword_prefix
    ns:( sns:$(symbol_namespace) namespace_separator { sns })?
    n:$(symbol_name)
    { SpannedValue::from_keyword(ns, n) }

// A parenthesised sequence of values, collected in source order into a linked list.
pub list -> SpannedValue = "(" __ items:(value)* __ ")"
    { SpannedValue::List(items.into_iter().collect::<LinkedList<_>>()) }

// A square-bracketed sequence of values, kept in source order.
pub vector -> SpannedValue = "[" __ v:(value)* __ "]"
    { SpannedValue::Vector(v) }

// A `#{ ... }` set of values. The parsed values are collected into a BTreeSet, so
// elements that compare equal collapse into one entry.
pub set -> SpannedValue = "#{" __ v:(value)* __ "}"
    { SpannedValue::Set(BTreeSet::from_iter(v)) }

// One map entry: two consecutive values, returned as a (key, value) tuple. No explicit
// separator is needed because `value` consumes its own surrounding whitespace.
pair -> (ValueAndSpan, ValueAndSpan) =
    k:(value) v:(value) {
        (k, v)
    }

// A `{ ... }` map built from consecutive key/value pairs. The pairs are collected into a
// BTreeMap in source order, so a later pair with an equal key replaces an earlier one.
pub map -> SpannedValue = "{" __ entries:(pair)* __ "}"
    { SpannedValue::Map(entries.into_iter().collect::<BTreeMap<_, _>>()) }

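// Numbers are handled by `number`, which tries `integer` before `float`. That only works
// because `raw_integer` refuses to match when the digits are followed by `.` or an exponent
// marker; without that lookahead, floats would be mistaken for integers and fail to parse.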
// Any single EDN value together with its source span. Leading and trailing
// whitespace/comments are consumed by this rule, but the positions are captured inside
// them, so the recorded span covers only the value itself.
// The alternatives form an ordered choice: the first one that matches wins.
pub value -> ValueAndSpan =
    __ start:#position v:(nil / nan / infinity / boolean / number / inst / uuid / text / keyword / symbol / list / vector / map / set) end:#position __ {
        ValueAndSpan {
            inner: v,
            span: Span::new(start, end)
        }
    }

// A value that `is_atom()` accepts; any other value makes the rule fail with
// "expected atom". The exact definition of "atom" lives in `ValueAndSpan::is_atom`,
// which is not part of this grammar.
atom -> ValueAndSpan
    = v:value {? if v.is_atom() { Ok(v) } else { Err("expected atom") } }

// Clojure (and thus EDN) regards commas as whitespace, and thus the two-element vectors [1 2] and
// [1,,,,2] are equivalent, as are the maps {:a 1, :b 2} and {:a 1 :b 2}.
// A single whitespace character; commas are included in the class. `#quiet` keeps this
// rule out of the "expected ..." set in parse error reports.
whitespace = #quiet<[  \r\n\t,]>
// A line comment: `;` up to the end of the line, including the line terminator character
// if one is present (a comment may also end at end of input).
comment = #quiet<";" [^\r\n]* [\r\n]?>

// Optional filler between tokens: any mix of whitespace and comments, possibly empty.
__ = (whitespace / comment)*

// The operation keyword of a transaction entity: `:db/add` or `:db/retract`.
pub op -> OpType
    = ":db/add"     { OpType::Add }
    / ":db/retract" { OpType::Retract }

// A keyword whose namespace is mandatory (`:ns/name`), returned as a `NamespacedKeyword`
// rather than wrapped in a `SpannedValue`.
raw_keyword -> NamespacedKeyword
    = keyword_prefix ns:$(symbol_namespace) namespace_separator n:$(symbol_name) { NamespacedKeyword::new(ns, n) }

// A namespaced keyword for which `is_forward()` holds; otherwise the rule fails.
// Judging by the error text, forward means the `:forward/keyword` shape.
raw_forward_keyword -> NamespacedKeyword
    = v:raw_keyword {? if v.is_forward() { Ok(v) } else { Err("expected :forward/keyword") } }

// A namespaced keyword for which `is_backward()` holds; otherwise the rule fails.
// Judging by the error text, backward means the `:backward/_keyword` shape.
raw_backward_keyword -> NamespacedKeyword
    = v:raw_keyword {? if v.is_backward() { Ok(v) } else { Err("expected :backward/_keyword") } }

// An entity identifier: either a fixed-width integer in any supported notation
// (`Entid::Entid`) or a namespaced keyword (`Entid::Ident`). Bigints and floats are not
// accepted here.
entid -> Entid
    = v:( raw_basedinteger / raw_hexinteger / raw_octalinteger / raw_integer ) { Entid::Entid(v) }
    / v:raw_keyword { Entid::Ident(v) }

// Like `entid`, except that a keyword is only accepted when it is a forward keyword.
forward_entid -> Entid
    = v:( raw_basedinteger / raw_hexinteger / raw_octalinteger / raw_integer ) { Entid::Entid(v) }
    / v:raw_forward_keyword { Entid::Ident(v) }

// A backward keyword, stored as the result of `to_reversed()`. Integers are not accepted.
backward_entid -> Entid
    = v:raw_backward_keyword { Entid::Ident(v.to_reversed()) }

// A lookup ref: `(lookup-ref <attribute> <value>)`. The attribute is any `entid`, and the
// value is stored with its span information stripped via `without_spans()`.
lookup_ref -> LookupRef
    = "(" __ "lookup-ref" __ a:(entid) __ v:(value) __ ")" { LookupRef { a, v: v.without_spans() } }

// A transaction function call with no arguments: `(<name>)`, where the name is a
// `symbol_name` and is wrapped in a `PlainSymbol`.
tx_function -> TxFunction
    = "(" __ n:$(symbol_name) __ ")" { TxFunction { op: PlainSymbol::new(n) } }

// Anything that can stand in the entity position of an assertion, tried in this order:
// a string (treated as an external temp id), an entid, a lookup ref, or a transaction
// function call.
entity_place -> EntidOrLookupRefOrTempId
    = v:raw_text { EntidOrLookupRefOrTempId::TempId(TempId::External(v)) }
    / v:entid { EntidOrLookupRefOrTempId::Entid(v) }
    / v:lookup_ref { EntidOrLookupRefOrTempId::LookupRef(v) }
    / v:tx_function { EntidOrLookupRefOrTempId::TxFunction(v) }

// One entry of a map-notation entity: an attribute (`entid`) followed by a value place.
value_place_pair -> (Entid, AtomOrLookupRefOrVectorOrMapNotation)
    = k:(entid) __ v:(value_place) { (k, v) }

// A `{ attribute value ... }` map-notation entity, collected from its attribute/value
// pairs into a `MapNotation`.
map_notation -> MapNotation
    = "{" __ kvs:(value_place_pair*) __ "}" { kvs.into_iter().collect() }

// Anything that can stand in the value position of an assertion, tried in this order:
// a lookup ref, a transaction function call, a vector of nested value places, a nested
// map notation, or an atom. Each alternative consumes its own surrounding whitespace and
// comments. The rule is recursive through the vector and map-notation alternatives.
value_place -> AtomOrLookupRefOrVectorOrMapNotation
    = __ v:lookup_ref __ { AtomOrLookupRefOrVectorOrMapNotation::LookupRef(v) }
    / __ v:tx_function __ { AtomOrLookupRefOrVectorOrMapNotation::TxFunction(v) }
    / __ "[" __ vs:(value_place*) __ "]" __ { AtomOrLookupRefOrVectorOrMapNotation::Vector(vs) }
    / __ v:map_notation __ { AtomOrLookupRefOrVectorOrMapNotation::MapNotation(v) }
    / __ v:atom __ { AtomOrLookupRefOrVectorOrMapNotation::Atom(v) }

// A single transaction entity, in one of three shapes:
//   1. `[op entity forward-attribute value]`
//   2. `[op value backward-attribute entity]` -- the two outer positions are swapped when
//      the `Entity` is built, so the entity place always ends up in `e` and the value
//      place in `v`;
//   3. a map-notation entity.
pub entity -> Entity
    = __ "[" __ op:(op) __ e:(entity_place) __ a:(forward_entid) __ v:(value_place) __ "]" __
        { Entity::AddOrRetract { op, e, a, v } }
    / __ "[" __ op:(op) __ val:(value_place) __ a:(backward_entid) __ ent:(entity_place) __ "]" __
        { Entity::AddOrRetract { op, e: ent, a, v: val } }
    / __ map:map_notation __
        { Entity::MapNotation(map) }

// A whole transaction: a square-bracketed sequence of zero or more entities, with
// optional whitespace and comments around the brackets.
pub entities -> Vec<Entity>
    = __ "[" __ es:(entity*) __ "]" __ { es }