[query] Widen known_types correctly in complex or. (#424) r=nalexander

* Part 1: define ValueTypeSet.

We're going to use this instead of `HashSet<ValueType>` so that we can clearly express
the empty set and the set of all types, and also to encapsulate a switch to `EnumSet`."

* Part 2: use ValueTypeSet.

* Part 3: fix type expansion.

* Part 4: add a test for type extraction from nested `or`.

* Review comments.

* Review comments: simplify ValueTypeSet.
This commit is contained in:
Richard Newman 2017-04-24 14:15:26 -07:00 committed by GitHub
parent bc63744aba
commit 19fc7cddf1
10 changed files with 254 additions and 47 deletions

View file

@ -4,9 +4,10 @@ version = "0.0.1"
workspace = ".."
[dependencies]
enum-set = { git = "https://github.com/rnewman/enum-set" }
lazy_static = "0.2.2"
num = "0.1.35"
ordered-float = "0.4.0"
lazy_static = "0.2.2"
[dependencies.edn]
path = "../edn"

View file

@ -8,6 +8,8 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
extern crate enum_set;
#[macro_use]
extern crate lazy_static;
extern crate ordered_float;
@ -19,6 +21,9 @@ pub mod values;
use std::collections::BTreeMap;
use std::fmt;
use std::rc::Rc;
use enum_set::EnumSet;
use self::ordered_float::OrderedFloat;
use self::edn::NamespacedKeyword;
@ -33,7 +38,8 @@ pub type Entid = i64;
/// The attribute of each Mentat assertion has a :db/valueType constraining the value to a
/// particular set. Mentat recognizes the following :db/valueType values.
#[derive(Clone,Copy,Debug,Eq,Hash,Ord,PartialOrd,PartialEq)]
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialOrd, PartialEq)]
#[repr(u32)]
pub enum ValueType {
Ref,
Boolean,
@ -44,6 +50,32 @@ pub enum ValueType {
Keyword,
}
impl ValueType {
pub fn all_enums() -> EnumSet<ValueType> {
// TODO: lazy_static.
let mut s = EnumSet::new();
s.insert(ValueType::Ref);
s.insert(ValueType::Boolean);
s.insert(ValueType::Instant);
s.insert(ValueType::Long);
s.insert(ValueType::Double);
s.insert(ValueType::String);
s.insert(ValueType::Keyword);
s
}
}
impl enum_set::CLike for ValueType {
fn to_u32(&self) -> u32 {
*self as u32
}
unsafe fn from_u32(v: u32) -> ValueType {
std::mem::transmute(v)
}
}
impl ValueType {
pub fn to_edn_value(self) -> edn::Value {
match self {

View file

@ -4,6 +4,7 @@ version = "0.0.1"
workspace = ".."
[dependencies]
enum-set = { git = "https://github.com/rnewman/enum-set" }
error-chain = "0.8.1"
[dependencies.mentat_core]

View file

@ -13,7 +13,6 @@ use std::cmp;
use std::collections::{
BTreeMap,
BTreeSet,
HashSet,
};
use std::collections::btree_map::Entry;
@ -59,6 +58,7 @@ use types::{
QueryValue,
SourceAlias,
TableAlias,
ValueTypeSet,
};
mod inputs;
@ -82,12 +82,6 @@ impl<T: Clone> RcCloned<T> for ::std::rc::Rc<T> {
}
}
fn unit_type_set(t: ValueType) -> HashSet<ValueType> {
let mut s = HashSet::with_capacity(1);
s.insert(t);
s
}
trait Contains<K, T> {
fn when_contains<F: FnOnce() -> T>(&self, k: &K, f: F) -> Option<T>;
}
@ -201,10 +195,11 @@ pub struct ConjoiningClauses {
/// A map from var to type. Whenever a var maps unambiguously to two different types, it cannot
/// yield results, so we don't represent that case here. If a var isn't present in the map, it
/// means that its type is not known in advance.
pub known_types: BTreeMap<Variable, HashSet<ValueType>>,
/// Usually that state should be represented by `ValueTypeSet::Any`.
pub known_types: BTreeMap<Variable, ValueTypeSet>,
/// A mapping, similar to `column_bindings`, but used to pull type tags out of the store at runtime.
/// If a var isn't present in `known_types`, it should be present here.
/// If a var isn't unit in `known_types`, it should be present here.
pub extracted_types: BTreeMap<Variable, QualifiedAlias>,
}
@ -278,7 +273,7 @@ impl ConjoiningClauses {
// Pre-fill our type mappings with the types of the input bindings.
cc.known_types
.extend(types.iter()
.map(|(k, v)| (k.clone(), unit_type_set(*v))));
.map(|(k, v)| (k.clone(), ValueTypeSet::of_one(*v))));
cc
},
}
@ -330,7 +325,7 @@ impl ConjoiningClauses {
/// or integer" isn't good enough.
pub fn known_type(&self, var: &Variable) -> Option<ValueType> {
match self.known_types.get(var) {
Some(types) if types.len() == 1 => types.iter().next().cloned(),
Some(set) if set.is_unit() => set.exemplar(),
_ => None,
}
}
@ -443,16 +438,12 @@ impl ConjoiningClauses {
/// Mark the given value as a long.
pub fn constrain_var_to_long(&mut self, variable: Variable) {
self.narrow_types_for_var(variable, unit_type_set(ValueType::Long));
self.narrow_types_for_var(variable, ValueTypeSet::of_one(ValueType::Long));
}
/// Mark the given value as one of the set of numeric types.
fn constrain_var_to_numeric(&mut self, variable: Variable) {
let mut numeric_types = HashSet::with_capacity(2);
numeric_types.insert(ValueType::Double);
numeric_types.insert(ValueType::Long);
self.narrow_types_for_var(variable, numeric_types);
self.narrow_types_for_var(variable, ValueTypeSet::of_numeric_types());
}
/// Constrains the var if there's no existing type.
@ -462,9 +453,9 @@ impl ConjoiningClauses {
// Is there an existing mapping for this variable?
// Any known inputs have already been added to known_types, and so if they conflict we'll
// spot it here.
if let Some(existing) = self.known_types.insert(variable.clone(), unit_type_set(this_type)) {
if let Some(existing) = self.known_types.insert(variable.clone(), ValueTypeSet::of_one(this_type)) {
// There was an existing mapping. Does this type match?
if !existing.contains(&this_type) {
if !existing.contains(this_type) {
self.mark_known_empty(EmptyBecause::TypeMismatch(variable, existing, this_type));
}
}
@ -477,21 +468,28 @@ impl ConjoiningClauses {
/// actually move from a state in which a variable can have no type to one that can
/// yield results! We never do so at present -- we carefully set-union types before we
/// set-intersect them -- but this is worth bearing in mind.
pub fn broaden_types(&mut self, additional_types: BTreeMap<Variable, HashSet<ValueType>>) {
pub fn broaden_types(&mut self, additional_types: BTreeMap<Variable, ValueTypeSet>) {
for (var, new_types) in additional_types {
match self.known_types.entry(var) {
Entry::Vacant(e) => {
if new_types.len() == 1 {
if new_types.is_unit() {
self.extracted_types.remove(e.key());
}
e.insert(new_types);
},
Entry::Occupied(mut e) => {
if e.get().is_empty() && self.empty_because.is_some() {
panic!("Uh oh: we failed this pattern, probably because {:?} couldn't match, but now we're broadening its type.",
e.get());
let new;
// Scoped borrow of `e`.
{
let existing_types = e.get();
if existing_types.is_empty() && // The set is empty: no types are possible.
self.empty_because.is_some() {
panic!("Uh oh: we failed this pattern, probably because {:?} couldn't match, but now we're broadening its type.",
e.key());
}
new = existing_types.union(&new_types);
}
e.get_mut().extend(new_types.into_iter());
e.insert(new);
},
}
}
@ -501,7 +499,7 @@ impl ConjoiningClauses {
/// If no types are already known -- `var` could have any type -- then this is equivalent to
/// simply setting the known types to `types`.
/// If the known types don't intersect with `types`, mark the pattern as known-empty.
fn narrow_types_for_var(&mut self, var: Variable, types: HashSet<ValueType>) {
fn narrow_types_for_var(&mut self, var: Variable, types: ValueTypeSet) {
if types.is_empty() {
// We hope this never occurs; we should catch this case earlier.
self.mark_known_empty(EmptyBecause::NoValidTypes(var));
@ -515,10 +513,9 @@ impl ConjoiningClauses {
e.insert(types);
},
Entry::Occupied(mut e) => {
// TODO: we shouldn't need to clone here.
let intersected: HashSet<_> = types.intersection(e.get()).cloned().collect();
let intersected: ValueTypeSet = types.intersection(e.get());
if intersected.is_empty() {
let mismatching_type = types.iter().next().unwrap().clone();
let mismatching_type = types.exemplar().expect("types isn't none");
let reason = EmptyBecause::TypeMismatch(e.key().clone(),
e.get().clone(),
mismatching_type);
@ -536,7 +533,7 @@ impl ConjoiningClauses {
/// Restrict the sets of types for the provided vars to the provided types.
/// See `narrow_types_for_var`.
pub fn narrow_types(&mut self, additional_types: BTreeMap<Variable, HashSet<ValueType>>) {
pub fn narrow_types(&mut self, additional_types: BTreeMap<Variable, ValueTypeSet>) {
if additional_types.is_empty() {
return;
}
@ -623,7 +620,7 @@ impl ConjoiningClauses {
&PatternValuePlace::Variable(ref v) => {
// Do we know that this variable can't be a string? If so, we don't need
// AllDatoms. None or String means it could be or definitely is.
match self.known_types.get(v).map(|types| types.contains(&ValueType::String)) {
match self.known_types.get(v).map(|types| types.contains(ValueType::String)) {
Some(false) => DatomsTable::Datoms,
_ => DatomsTable::AllDatoms,
}
@ -769,7 +766,7 @@ impl ConjoiningClauses {
return;
}
for (var, types) in self.known_types.iter() {
if types.len() == 1 {
if types.is_unit() {
self.extracted_types.remove(var);
}
}

View file

@ -9,7 +9,10 @@
// specific language governing permissions and limitations under the License.
use std::collections::btree_map::Entry;
use std::collections::BTreeSet;
use std::collections::{
BTreeMap,
BTreeSet,
};
use mentat_core::{
Schema,
@ -41,6 +44,7 @@ use types::{
EmptyBecause,
QualifiedAlias,
SourceAlias,
ValueTypeSet,
VariableColumn,
};
@ -654,6 +658,21 @@ impl ConjoiningClauses {
type_associations = type_needed.iter().cloned().collect();
}
// Collect the new type information from the arms. There's some redundant work here --
// they already have all of the information from the parent.
// Note that we start with the first clause's type information.
{
let mut clauses = acc.iter();
let mut additional_types = clauses.next()
.expect("there to be at least one clause")
.known_types
.clone();
for cc in clauses {
union_types(&mut additional_types, &cc.known_types);
}
self.broaden_types(additional_types);
}
let union = ComputedTable::Union {
projection: projection,
type_extraction: type_needed,
@ -674,6 +693,43 @@ impl ConjoiningClauses {
}
}
/// Helper to fold together a set of type maps.
fn union_types(into: &mut BTreeMap<Variable, ValueTypeSet>,
additional_types: &BTreeMap<Variable, ValueTypeSet>) {
// We want the exclusive disjunction -- any variable not mentioned in both sets -- to default
// to ValueTypeSet::Any.
// This is necessary because we lazily populate known_types, so sometimes the type set will
// be missing a `ValueTypeSet::Any` for a variable, and we want to broaden rather than
// accidentally taking the other side's word for it!
// The alternative would be to exhaustively pre-fill `known_types` with all mentioned variables
// in the whole query, which is daunting.
let mut any: BTreeMap<Variable, ValueTypeSet>;
// Scoped borrow of `into`.
{
let i: BTreeSet<&Variable> = into.keys().collect();
let a: BTreeSet<&Variable> = additional_types.keys().collect();
any = i.symmetric_difference(&a)
.map(|v| ((*v).clone(), ValueTypeSet::any()))
.collect();
}
// Collect the additional types.
for (var, new_types) in additional_types {
match into.entry(var.clone()) {
Entry::Vacant(e) => {
e.insert(new_types.clone());
},
Entry::Occupied(mut e) => {
let new = e.get().union(&new_types);
e.insert(new);
},
}
}
// Blat in those that are disjoint.
into.append(&mut any);
}
#[cfg(test)]
mod testing {
extern crate mentat_query_parser;

View file

@ -293,7 +293,6 @@ mod testing {
add_attribute,
associate_ident,
ident,
unit_type_set,
};
use types::{
@ -303,6 +302,7 @@ mod testing {
QualifiedAlias,
QueryValue,
SourceAlias,
ValueTypeSet,
};
use algebrize;
@ -801,7 +801,7 @@ mod testing {
assert!(cc.is_known_empty());
assert_eq!(cc.empty_because.unwrap(),
EmptyBecause::TypeMismatch(y.clone(), unit_type_set(ValueType::String), ValueType::Boolean));
EmptyBecause::TypeMismatch(y.clone(), ValueTypeSet::of_one(ValueType::String), ValueType::Boolean));
}
#[test]
@ -839,7 +839,7 @@ mod testing {
assert!(cc.is_known_empty());
assert_eq!(cc.empty_because.unwrap(),
EmptyBecause::TypeMismatch(x.clone(), unit_type_set(ValueType::Ref), ValueType::Boolean));
EmptyBecause::TypeMismatch(x.clone(), ValueTypeSet::of_one(ValueType::Ref), ValueType::Boolean));
}
#[test]
@ -854,8 +854,8 @@ mod testing {
let e = Variable::from_valid_name("?e");
let v = Variable::from_valid_name("?v");
let cc = alg(&schema, query);
assert_eq!(cc.known_types.get(&e), Some(&unit_type_set(ValueType::Ref)));
assert_eq!(cc.known_types.get(&v), Some(&unit_type_set(ValueType::Boolean)));
assert_eq!(cc.known_types.get(&e), Some(&ValueTypeSet::of_one(ValueType::Ref)));
assert_eq!(cc.known_types.get(&v), Some(&ValueTypeSet::of_one(ValueType::Boolean)));
assert!(!cc.extracted_types.contains_key(&e));
assert!(!cc.extracted_types.contains_key(&v));
}

View file

@ -87,7 +87,6 @@ impl ConjoiningClauses {
mod testing {
use super::*;
use std::collections::HashSet;
use mentat_core::attribute::Unique;
use mentat_core::{
Attribute,
@ -115,6 +114,7 @@ mod testing {
ColumnConstraint,
EmptyBecause,
QueryValue,
ValueTypeSet,
};
#[test]
@ -159,7 +159,7 @@ mod testing {
// After processing those two clauses, we know that ?y must be numeric, but not exactly
// which type it must be.
assert_eq!(None, cc.known_type(&y)); // Not just one.
let expected: HashSet<ValueType> = vec![ValueType::Double, ValueType::Long].into_iter().collect();
let expected = ValueTypeSet::of_numeric_types();
assert_eq!(Some(&expected), cc.known_types.get(&y));
let clauses = cc.wheres;
@ -225,8 +225,7 @@ mod testing {
assert!(cc.is_known_empty());
assert_eq!(cc.empty_because.unwrap(),
EmptyBecause::TypeMismatch(y.clone(),
vec![ValueType::Double, ValueType::Long].into_iter()
.collect(),
ValueTypeSet::of_numeric_types(),
ValueType::String));
}
}

View file

@ -8,6 +8,8 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
extern crate enum_set;
#[macro_use]
extern crate error_chain;

View file

@ -9,14 +9,17 @@
// specific language governing permissions and limitations under the License.
use std::collections::BTreeSet;
use std::collections::HashSet;
use std::fmt::{
Debug,
Formatter,
Result,
};
use enum_set::{
CLike,
EnumSet,
};
use mentat_core::{
Entid,
TypedValue,
@ -411,7 +414,7 @@ impl Debug for ColumnConstraint {
#[derive(PartialEq, Clone)]
pub enum EmptyBecause {
// Var, existing, desired.
TypeMismatch(Variable, HashSet<ValueType>, ValueType),
TypeMismatch(Variable, ValueTypeSet, ValueType),
NoValidTypes(Variable),
NonNumericArgument,
NonStringFulltextValue,
@ -461,4 +464,89 @@ impl Debug for EmptyBecause {
},
}
}
}
trait EnumSetExtensions<T: CLike + Clone> {
/// Return a set containing both `x` and `y`.
fn of_both(x: T, y: T) -> EnumSet<T>;
/// Return a clone of `self` with `y` added.
fn with(&self, y: T) -> EnumSet<T>;
}
impl<T: CLike + Clone> EnumSetExtensions<T> for EnumSet<T> {
/// Return a set containing both `x` and `y`.
fn of_both(x: T, y: T) -> Self {
let mut o = EnumSet::new();
o.insert(x);
o.insert(y);
o
}
/// Return a clone of `self` with `y` added.
fn with(&self, y: T) -> EnumSet<T> {
let mut o = self.clone();
o.insert(y);
o
}
}
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ValueTypeSet(pub EnumSet<ValueType>);
impl Default for ValueTypeSet {
fn default() -> ValueTypeSet {
ValueTypeSet::any()
}
}
impl ValueTypeSet {
pub fn any() -> ValueTypeSet {
ValueTypeSet(ValueType::all_enums())
}
pub fn none() -> ValueTypeSet {
ValueTypeSet(EnumSet::new())
}
/// Return a set containing only `t`.
pub fn of_one(t: ValueType) -> ValueTypeSet {
let mut s = EnumSet::new();
s.insert(t);
ValueTypeSet(s)
}
/// Return a set containing `Double` and `Long`.
pub fn of_numeric_types() -> ValueTypeSet {
ValueTypeSet(EnumSet::of_both(ValueType::Double, ValueType::Long))
}
}
impl ValueTypeSet {
/// Returns a set containing all the types in this set and `other`.
pub fn union(&self, other: &ValueTypeSet) -> ValueTypeSet {
ValueTypeSet(self.0.union(other.0))
}
pub fn intersection(&self, other: &ValueTypeSet) -> ValueTypeSet {
ValueTypeSet(self.0.intersection(other.0))
}
/// Return an arbitrary type that's part of this set.
/// For a set containing a single type, this will be that type.
pub fn exemplar(&self) -> Option<ValueType> {
self.0.iter().next()
}
pub fn contains(&self, vt: ValueType) -> bool {
self.0.contains(&vt)
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn is_unit(&self) -> bool {
self.0.len() == 1
}
}

View file

@ -466,3 +466,34 @@ fn test_order_by() {
ORDER BY `?y_value_type_tag` ASC, `?y` ASC, `?x` ASC");
assert_eq!(args, vec![]);
}
#[test]
fn test_complex_nested_or_join_type_projection() {
let mut schema = Schema::default();
associate_ident(&mut schema, NamespacedKeyword::new("page", "title"), 98);
add_attribute(&mut schema, 98, Attribute {
value_type: ValueType::String,
..Default::default()
});
let input = r#"[:find [?y]
:where
(or
(or
[_ :page/title ?y])
(or
[_ :page/title ?y]))]"#;
let SQLQuery { sql, args } = translate(&schema, input);
assert_eq!(sql, "SELECT `c00`.`?y` AS `?y` \
FROM (SELECT `datoms00`.v AS `?y` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 98 \
UNION \
SELECT `datoms01`.v AS `?y` \
FROM `datoms` AS `datoms01` \
WHERE `datoms01`.a = 98) \
AS `c00` \
LIMIT 1");
assert_eq!(args, vec![]);
}