mentat/edn/src/edn.rustpeg

/* vim: set filetype=rust.rustpeg */

// Copyright 2016 Mozilla
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use
// this file except in compliance with the License. You may obtain a copy of the
// License at http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

use std::collections::{BTreeSet, BTreeMap, LinkedList};
use std::iter::FromIterator;
use std::f64::{NAN, INFINITY, NEG_INFINITY};

use chrono::{
    DateTime,
    TimeZone,
    Utc
};
use num::BigInt;
use ordered_float::OrderedFloat;
use uuid::Uuid;

use types::{SpannedValue, Span, ValueAndSpan};

// Goal: Be able to parse https://github.com/edn-format/edn
// Also extensible to help parse http://docs.datomic.com/query.html

// Debugging hint: test using `cargo test --features peg/trace -- --nocapture`
// to trace where the parser is failing

// TODO: Support tagged elements
// TODO: Support discard

// The literal `nil`, tagged with the input range it was read from.
pub nil -> ValueAndSpan =
    from:#position "nil" to:#position {
        let span = Span::new(from, to);
        ValueAndSpan { inner: SpannedValue::Nil, span: span }
    }

// Not-a-number, written as `#f NaN` (at least one whitespace after `#f`).
pub nan -> ValueAndSpan =
    from:#position "#f" whitespace+ "NaN" to:#position {
        let not_a_number = OrderedFloat(NAN);
        ValueAndSpan {
            inner: SpannedValue::Float(not_a_number),
            span: Span::new(from, to)
        }
    }

// Signed infinity, written as `#f +Infinity` or `#f -Infinity`.
// The sign is mandatory; anything other than "+" yields negative infinity.
pub infinity -> ValueAndSpan =
    from:#position "#f" whitespace+ polarity:$(sign) "Infinity" to:#position {
        let magnitude = match polarity {
            "+" => INFINITY,
            _ => NEG_INFINITY,
        };
        ValueAndSpan {
            inner: SpannedValue::Float(OrderedFloat(magnitude)),
            span: Span::new(from, to)
        }
    }

// The literals `true` and `false`.
// The inner choice yields the Rust bool; the span is built once for both cases.
pub boolean -> ValueAndSpan =
    start:#position flag:( "true" { true } / "false" { false } ) end:#position {
        ValueAndSpan {
            inner: SpannedValue::Boolean(flag),
            span: Span::new(start, end)
        }
    }

// Lexical building blocks shared by the numeric rules below.

// A single decimal digit.
digit = [0-9]
// A decimal digit or an ASCII letter; used for the digits of a based integer.
alphanumeric = [0-9a-zA-Z]
// A single octal digit.
octaldigit = [0-7]
// A radix from 2 to 36: 30-36, then 10-29, then a single digit 2-9.
// The two-digit alternatives are tried first so that e.g. "26" is not matched
// as the one-digit radix "2".
validbase = [3][0-6] / [12][0-9] / [2-9]
// A single hexadecimal digit, either case.
hex = [0-9a-fA-F]
// An explicit numeric sign.
sign = "-" / "+"

// Arbitrary-precision integer: an optionally signed run of digits followed by
// the suffix `N`. The suffix is part of the span but not of the parsed text.
pub bigint -> ValueAndSpan =
    start:#position digits:$( sign? digit+ ) "N" end:#position {
        let big = digits.parse::<BigInt>().unwrap();
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::BigInteger(big), span: span }
    }

// Octal integer: a leading `0` followed by one or more octal digits, e.g. `0777`.
//
// The grammar does not bound the number of digits, so the conversion to i64 can
// overflow. The action is fallible (`{? }`): a literal that does not fit in an
// i64 makes this rule fail to match instead of panicking inside the parser.
pub octalinteger -> ValueAndSpan =
    start:#position "0" i:$( octaldigit+ ) end:#position {?
        i64::from_str_radix(i, 8)
            .map(|n| ValueAndSpan {
                inner: SpannedValue::Integer(n),
                span: Span::new(start, end)
            })
            .map_err(|_| "octal integer out of range for i64")
    }

// Hexadecimal integer: `0x` followed by one or more hex digits, e.g. `0x1F`.
//
// The grammar does not bound the number of digits, so the conversion to i64 can
// overflow. The action is fallible (`{? }`): a literal that does not fit in an
// i64 makes this rule fail to match instead of panicking inside the parser.
pub hexinteger -> ValueAndSpan =
    start:#position "0x" i:$( hex+ ) end:#position {?
        i64::from_str_radix(i, 16)
            .map(|n| ValueAndSpan {
                inner: SpannedValue::Integer(n),
                span: Span::new(start, end)
            })
            .map_err(|_| "hexadecimal integer out of range for i64")
    }

// Integer in an explicit radix, written `<radix>r<digits>`, e.g. `2r1010` or
// `36rZZ`. `validbase` only admits radixes 2-36.
//
// `alphanumeric+` accepts digits that are not valid for the chosen radix (for
// example `2r9`), and the value may not fit in an i64. Both cases are reported
// through the fallible action (`{? }`), so the rule fails to match instead of
// panicking inside the parser.
pub basedinteger -> ValueAndSpan =
    start:#position b:$( validbase ) "r" i:$( alphanumeric+ ) end:#position {?
        b.parse::<u32>()
            .ok()
            .and_then(|radix| i64::from_str_radix(i, radix).ok())
            .map(|n| ValueAndSpan {
                inner: SpannedValue::Integer(n),
                span: Span::new(start, end)
            })
            .ok_or("invalid digits for radix, or integer out of range for i64")
    }

// Optionally signed decimal integer, e.g. `42` or `-7`.
//
// The grammar does not bound the number of digits, so the conversion to i64 can
// overflow. The action is fallible (`{? }`): a literal that does not fit in an
// i64 makes this rule fail to match instead of panicking inside the parser.
// (Arbitrarily large integers are written with the `N` suffix; see `bigint`.)
pub integer -> ValueAndSpan =
    start:#position i:$( sign? digit+ ) end:#position {?
        i.parse::<i64>()
            .map(|n| ValueAndSpan {
                inner: SpannedValue::Integer(n),
                span: Span::new(start, end)
            })
            .map_err(|_| "integer out of range for i64")
    }

// The three textual shapes of a float literal. All allow an optional leading sign.
// frac:     digits, a dot, digits                      e.g. -1.5
// exp:      digits, e/E, optionally signed digits      e.g. 1e10
// frac_exp: digits, a dot, digits, then an exponent    e.g. 1.5E-3
frac =     sign? digit+ "." digit+
exp =      sign? digit+            ("e" / "E") sign? digit+
frac_exp = sign? digit+ "." digit+ ("e" / "E") sign? digit+

// Floating-point literal in any of the three shapes above.
// Ordering matters: frac_exp has to be attempted first, because exp and frac
// each match a prefix of a frac_exp literal and would leave the rest unparsed.
pub float -> ValueAndSpan =
    start:#position literal:$( frac_exp / exp / frac ) end:#position {
        let number: f64 = literal.parse::<f64>().unwrap();
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Float(OrderedFloat(number)), span: span }
    }

// Characters allowed inside a string literal.
//
// Escape sequences must be tried before the catch-all `[^"]`. A backslash is
// itself a non-quote character, so with the catch-all first the `\"` escape
// could never match and a string ended at its first escaped quote.
// `\\` is matched as a unit so that a string ending in an escaped backslash
// still terminates at its closing quote rather than treating it as `\"`.
// Any other backslash is still accepted as an ordinary character.
//
// TODO: \newline, \return, \space and \tab
special_char = quote / escaped_backslash / tab
quote = "\\\""
escaped_backslash = "\\\\"
tab = "\\tab"
char = special_char / [^"]

// Double-quoted string. The value is the raw text between the quotes, copied
// as-is (escape sequences are not decoded); the span includes both quotes.
pub text -> ValueAndSpan =
    start:#position "\"" body:$( char* ) "\"" end:#position {
        let owned = body.to_string();
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Text(owned), span: span }
    }


// RFC 3339 timestamps. #inst "1985-04-12T23:20:50.52Z"
// We accept an arbitrary depth of decimals.
// Note that we discard the timezone information -- all times are translated to UTC.
//
// The zone designator (`Z` or a `+hh:mm` / `-hh:mm` offset) is parenthesised as
// a unit. Choice (`/`) binds more loosely than sequencing, so without the
// parentheses the alternatives would be "entire timestamp ending in Z" versus
// "offset on its own", and timestamps with a numeric offset could never match.
//
// The pattern only checks the overall shape; `parse_from_rfc3339` performs the
// real validation, and its failure is turned into a parse failure by the
// fallible action (`{? }`).
pub inst_string -> DateTime<Utc> =
    "#inst" whitespace+ "\"" d:$( [0-9]*<4> "-" [0-2][0-9] "-" [0-3][0-9]
              "T"
              [0-2][0-9] ":" [0-5][0-9] ":" [0-6][0-9]
              ("." [0-9]+)?
              ("Z" / (("+" / "-") [0-2][0-9] ":" [0-5][0-9]))
            )
    "\"" {?
        DateTime::parse_from_rfc3339(d)
            .map(|t| t.with_timezone(&Utc))
            .map_err(|_| "invalid datetime")        // Oh, rustpeg.
    }

// Instant given as unsigned microseconds since the Unix epoch:
// `#instmicros 1493410985187000`.
//
// The digit run is unbounded, so it may not fit in an i64, and even a value
// that fits can lie outside the range chrono can represent. Both cases are
// reported through the fallible action (`{? }`), so the rule fails to match
// instead of panicking inside the parser.
pub inst_micros -> DateTime<Utc> =
    "#instmicros" whitespace+ d:$( digit+ ) {?
        d.parse::<i64>()
            .ok()
            .and_then(|micros| {
                // `digit+` has no sign, so `micros` is non-negative and the
                // remainder below is in 0..1_000_000.
                let seconds: i64 = micros / 1_000_000;
                let nanos: u32 = ((micros % 1_000_000) as u32) * 1000;
                Utc.timestamp_opt(seconds, nanos).single()
            })
            .ok_or("#instmicros value out of range")
    }

// Instant given as unsigned milliseconds since the Unix epoch:
// `#instmillis 1493410985187`.
//
// The digit run is unbounded, so it may not fit in an i64, and even a value
// that fits can lie outside the range chrono can represent. Both cases are
// reported through the fallible action (`{? }`), so the rule fails to match
// instead of panicking inside the parser.
pub inst_millis -> DateTime<Utc> =
    "#instmillis" whitespace+ d:$( digit+ ) {?
        d.parse::<i64>()
            .ok()
            .and_then(|millis| {
                // `digit+` has no sign, so `millis` is non-negative and the
                // remainder below is in 0..1000.
                let seconds: i64 = millis / 1000;
                let nanos: u32 = ((millis % 1000) as u32) * 1_000_000;
                Utc.timestamp_opt(seconds, nanos).single()
            })
            .ok_or("#instmillis value out of range")
    }

// Any of the three instant notations (#instmillis, #instmicros, #inst "..."),
// wrapped as a spanned value.
pub inst -> ValueAndSpan =
    start:#position moment:(inst_millis / inst_micros / inst_string) end:#position {
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Instant(moment), span: span }
    }

// A quoted UUID in 8-4-4-4-12 lowercase-hex form. The pattern fixes the shape,
// so the conversion below is expected to succeed for anything that matched.
pub uuid_string -> Uuid =
    "\""
    raw:$( [a-f0-9]*<8> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<4> "-" [a-f0-9]*<12> )
    "\"" {
        let parsed = Uuid::parse_str(raw);
        parsed.expect("this is a valid UUID string")
    }

// Tagged UUID: `#uuid`, at least one whitespace, then a quoted UUID string.
pub uuid -> ValueAndSpan =
    start:#position "#uuid" whitespace+ id:(uuid_string) end:#position {
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Uuid(id), span: span }
    }

// Separates the segments inside a namespace, as in `foo.bar`.
namespace_divider = "."
// Separates a namespace from the name that follows it, as in `foo.bar/baz`.
namespace_separator = "/"

// TODO: Be more picky here
// Keywords follow the rules of symbols, except they can (and must) begin with :
// e.g. :fred or :my/fred. See https://github.com/edn-format/edn#keywords
// The subsequent-character set is the initial set plus `-`.
symbol_char_initial = [a-z] / [A-Z] / [0-9] / [*!_?$%&=<>]
symbol_char_subsequent = [a-z] / [A-Z] / [0-9] / [-*!_?$%&=<>]

// A namespace is one segment, optionally followed by further "."-separated segments.
symbol_namespace = symbol_char_initial symbol_char_subsequent* (namespace_divider symbol_char_subsequent+)*
// The name part of a symbol or keyword.
symbol_name = ( symbol_char_initial+ symbol_char_subsequent* )
// Plain symbols additionally allow the bare names "..." and ".".
// "..." is listed before "." so that the longer literal is tried first.
plain_symbol_name = symbol_name / "..." / "."

// Every keyword starts with this prefix.
keyword_prefix = ":"

// A symbol, optionally namespaced: `name` or `some.ns/name`.
// The optional group yields only the namespace text; the "/" is consumed and dropped.
pub symbol -> ValueAndSpan =
    start:#position
    namespace:( prefix:$(symbol_namespace) namespace_separator { prefix })?
    name:$(plain_symbol_name)
    end:#position {
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::from_symbol(namespace, name), span: span }
    }

// A keyword, optionally namespaced: `:name` or `:some.ns/name`.
// The leading ":" is inside the span but is not passed on as part of the name.
pub keyword -> ValueAndSpan =
    start:#position
    keyword_prefix
    namespace:( prefix:$(symbol_namespace) namespace_separator { prefix })?
    name:$(symbol_name)
    end:#position {
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::from_keyword(namespace, name), span: span }
    }

// Parenthesised sequence of values, stored as a linked list in source order.
pub list -> ValueAndSpan =
    start:#position "(" __ items:(value)* __ ")" end:#position {
        let elements = LinkedList::from_iter(items);
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::List(elements), span: span }
    }

// Square-bracketed sequence of values, kept in source order.
pub vector -> ValueAndSpan =
    start:#position "[" __ items:(value)* __ "]" end:#position {
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Vector(items), span: span }
    }

// `#{ ... }` collection of values, gathered into an ordered set.
pub set -> ValueAndSpan =
    start:#position "#{" __ items:(value)* __ "}" end:#position {
        let members = BTreeSet::from_iter(items);
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Set(members), span: span }
    }

// Two consecutive values, read as one map entry (key first, then value).
pair -> (ValueAndSpan, ValueAndSpan) =
    key:(value) val:(value) {
        let entry = (key, val);
        entry
    }

// `{ ... }` collection of key/value pairs, gathered into an ordered map.
pub map -> ValueAndSpan =
    start:#position "{" __ entries:(pair)* __ "}" end:#position {
        let table = BTreeMap::from_iter(entries);
        let span = Span::new(start, end);
        ValueAndSpan { inner: SpannedValue::Map(table), span: span }
    }

// Any single EDN value, with surrounding whitespace and comments skipped.
//
// The alternatives are tried in order, and the order is significant:
// float has to precede the integer rules, otherwise the integer part of a float
// is taken as an integer and the remainder fails to parse. Likewise bigint
// precedes integer so that the digits of `123N` are not consumed without the
// suffix.
pub value -> ValueAndSpan =
    __ parsed:(
        nil / nan / infinity / boolean /
        float / octalinteger / hexinteger / basedinteger /
        inst / uuid /
        bigint / integer /
        text / keyword / symbol /
        list / vector / map / set
    ) __ {
        parsed
    }

// Clojure (and thus EDN) regards commas as whitespace, and thus the two-element vectors [1 2] and
// [1,,,,2] are equivalent, as are the maps {:a 1, :b 2} and {:a 1 :b 2}.
whitespace = (" " / "\r" / "\n" / "\t" / ",")
// A comment runs from ";" up to the end of the line. The line terminator is
// optional so that a comment on the last line of the input is still accepted.
comment = ";" [^\r\n]* ("\r" / "\n")?

// Skippable filler between tokens: any mix of whitespace and comments,
// possibly empty.
__ = (whitespace / comment)*