From de4d58f61422f9e6c63984cb131441c542ce97d0 Mon Sep 17 00:00:00 2001 From: Richard Newman Date: Fri, 16 Jun 2017 13:31:47 -0700 Subject: [PATCH] Parse and algebrize simple aggregates. (#312) --- query-algebrizer/src/lib.rs | 12 +- query-parser/src/parse.rs | 9 + query-projector/src/lib.rs | 453 +++++++++++++++++++++++++--- query-projector/tests/aggregates.rs | 87 ++++++ query-sql/src/lib.rs | 69 ++++- query-translator/src/translate.rs | 23 +- query-translator/tests/translate.rs | 31 +- query/src/lib.rs | 7 +- tests/query.rs | 275 ++++++++++++++++- 9 files changed, 892 insertions(+), 74 deletions(-) create mode 100644 query-projector/tests/aggregates.rs diff --git a/query-algebrizer/src/lib.rs b/query-algebrizer/src/lib.rs index f81e55e8..412f1943 100644 --- a/query-algebrizer/src/lib.rs +++ b/query-algebrizer/src/lib.rs @@ -62,8 +62,13 @@ pub use types::{ pub struct AlgebraicQuery { default_source: SrcVar, pub find_spec: FindSpec, - has_aggregates: bool, + + /// A set of variables that the caller wishes to be used for grouping when aggregating. pub with: BTreeSet, + + /// A set of variables that must be projected in order for query features such as ordering + /// to work correctly. + pub named_projection: BTreeSet, pub order: Option>, pub limit: Limit, pub cc: clauses::ConjoiningClauses, @@ -187,15 +192,14 @@ pub fn algebrize_with_inputs(schema: &Schema, cc.prune_extracted_types(); let (order, extra_vars) = validate_and_simplify_order(&cc, parsed.order)?; - let with: BTreeSet = parsed.with.into_iter().chain(extra_vars.into_iter()).collect(); // This might leave us with an unused `:in` variable. let limit = if parsed.find_spec.is_unit_limited() { Limit::Fixed(1) } else { parsed.limit }; let q = AlgebraicQuery { default_source: parsed.default_source, find_spec: parsed.find_spec, - has_aggregates: false, // TODO: we don't parse them yet. - with: with, + with: parsed.with, + named_projection: extra_vars, order: order, limit: limit, cc: cc, diff --git a/query-parser/src/parse.rs b/query-parser/src/parse.rs index 79614105..1e59c8d7 100644 --- a/query-parser/src/parse.rs +++ b/query-parser/src/parse.rs @@ -38,6 +38,7 @@ use self::mentat_parser_utils::value_and_span::{ }; use self::mentat_query::{ + Aggregate, Binding, Direction, Element, @@ -270,6 +271,13 @@ def_parser!(Query, func, (QueryFunction, Vec), { (Query::query_function(), Query::arguments()) }); +def_parser!(Query, aggregate, Aggregate, { + seq().of_exactly((Query::query_function(), Query::arguments())) + .map(|(func, args)| Aggregate { + func, args, + }) +}); + /// A vector containing just a parenthesized filter expression. def_parser!(Where, pred, WhereClause, { // Accept either a nested list or a nested vector here: @@ -376,6 +384,7 @@ def_matches_plain_symbol!(Find, placeholder, "_"); def_parser!(Find, elem, Element, { Query::variable().map(Element::Variable) + .or(Query::aggregate().map(Element::Aggregate)) }); def_parser!(Find, find_scalar, FindSpec, { diff --git a/query-projector/src/lib.rs b/query-projector/src/lib.rs index f7e963f0..398f56de 100644 --- a/query-projector/src/lib.rs +++ b/query-projector/src/lib.rs @@ -19,7 +19,9 @@ extern crate mentat_query_algebrizer; extern crate mentat_query_sql; extern crate mentat_sql; +use std::collections::BTreeSet; use std::iter; + use rusqlite::{ Row, Rows, @@ -39,9 +41,12 @@ use mentat_db::{ }; use mentat_query::{ + Aggregate, Element, FindSpec, Limit, + PlainSymbol, + QueryFunction, Variable, }; @@ -49,11 +54,14 @@ use mentat_query_algebrizer::{ AlgebraicQuery, ColumnName, ConjoiningClauses, + QualifiedAlias, VariableColumn, }; use mentat_query_sql::{ ColumnOrExpression, + Expression, + GroupBy, Name, Projection, ProjectedColumn, @@ -64,6 +72,31 @@ error_chain! { Error, ErrorKind, ResultExt, Result; } + errors { + /// We're just not done yet. Message that the feature is recognized but not yet + /// implemented. + NotYetImplemented(t: String) { + description("not yet implemented") + display("not yet implemented: {}", t) + } + CannotProjectImpossibleBinding(op: SimpleAggregationOp) { + description("no possible types for variable in projection list") + display("no possible types for value provided to {:?}", op) + } + CannotApplyAggregateOperationToTypes(op: SimpleAggregationOp, types: ValueTypeSet) { + description("cannot apply projection operation to types") + display("cannot apply projection operation {:?} to types {:?}", op, types) + } + UnboundVariable(var: PlainSymbol) { + description("cannot project unbound variable") + display("cannot project unbound variable {:?}", var) + } + NoTypeAvailableForVariable(var: PlainSymbol) { + description("cannot find type for variable") + display("cannot find type for variable {:?}", var) + } + } + foreign_links { Rusqlite(rusqlite::Error); } @@ -161,17 +194,22 @@ impl TypedIndex { } } -fn candidate_column(cc: &ConjoiningClauses, var: &Variable) -> (ColumnOrExpression, Name) { +fn cc_column(cc: &ConjoiningClauses, var: &Variable) -> Result { + cc.column_bindings + .get(var) + .and_then(|cols| cols.get(0).cloned()) + .ok_or_else(|| ErrorKind::UnboundVariable(var.name()).into()) +} + +fn candidate_column(cc: &ConjoiningClauses, var: &Variable) -> Result<(ColumnOrExpression, Name)> { // Every variable should be bound by the top-level CC to at least // one column in the query. If that constraint is violated it's a // bug in our code, so it's appropriate to panic here. - let columns = cc.column_bindings - .get(var) - .expect(format!("Every variable should have a binding, but {:?} does not", var).as_str()); - - let qa = columns[0].clone(); - let name = VariableColumn::Variable(var.clone()).column_name(); - (ColumnOrExpression::Column(qa), name) + cc_column(cc, var) + .map(|qa| { + let name = VariableColumn::Variable(var.clone()).column_name(); + (ColumnOrExpression::Column(qa), name) + }) } fn candidate_type_column(cc: &ConjoiningClauses, var: &Variable) -> (ColumnOrExpression, Name) { @@ -183,21 +221,209 @@ fn candidate_type_column(cc: &ConjoiningClauses, var: &Variable) -> (ColumnOrExp } /// Return the projected column -- that is, a value or SQL column and an associated name -- for a -/// given variable. Also return the type, if known. +/// given variable. Also return the type. /// Callers are expected to determine whether to project a type tag as an additional SQL column. -pub fn projected_column_for_var(var: &Variable, cc: &ConjoiningClauses) -> (ProjectedColumn, Option) { +pub fn projected_column_for_var(var: &Variable, cc: &ConjoiningClauses) -> Result<(ProjectedColumn, ValueTypeSet)> { if let Some(value) = cc.bound_value(&var) { // If we already know the value, then our lives are easy. let tag = value.value_type(); let name = VariableColumn::Variable(var.clone()).column_name(); - (ProjectedColumn(ColumnOrExpression::Value(value.clone()), name), Some(tag)) + Ok((ProjectedColumn(ColumnOrExpression::Value(value.clone()), name), ValueTypeSet::of_one(tag))) } else { // If we don't, then the CC *must* have bound the variable. - let (column, name) = candidate_column(cc, var); - (ProjectedColumn(column, name), cc.known_type(var)) + let (column, name) = candidate_column(cc, var)?; + Ok((ProjectedColumn(column, name), cc.known_type_set(var))) } } +fn projected_column_for_simple_aggregate(simple: &SimpleAggregate, cc: &ConjoiningClauses) -> Result<(ProjectedColumn, ValueType)> { + let known_types = cc.known_type_set(&simple.var); + let return_type = simple.op.is_applicable_to_types(known_types)?; + let projected_column_or_expression = + if let Some(value) = cc.bound_value(&simple.var) { + // Oh, we already know the value! + if simple.use_static_value() { + // We can statically compute the aggregate result for some operators -- not count or + // sum, but avg/max/min are OK. + ColumnOrExpression::Value(value) + } else { + let expression = Expression::Unary { + sql_op: simple.op.to_sql(), + arg: ColumnOrExpression::Value(value), + }; + ColumnOrExpression::Expression(Box::new(expression), return_type) + } + } else { + // The common case: the values are bound during execution. + let columns = cc.column_bindings + .get(&simple.var) + .expect(format!("Every variable should have a binding, but {:?} does not", simple.var).as_str()); + let expression = Expression::Unary { + sql_op: simple.op.to_sql(), + arg: ColumnOrExpression::Column(columns[0].clone()), + }; + ColumnOrExpression::Expression(Box::new(expression), return_type) + }; + Ok((ProjectedColumn(projected_column_or_expression, simple.column_name()), return_type)) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SimpleAggregationOp { + Avg, + Count, + Max, + Min, + Sum, +} + +impl SimpleAggregationOp { + fn to_sql(&self) -> &'static str { + use SimpleAggregationOp::*; + match self { + &Avg => "avg", + &Count => "count", + &Max => "max", + &Min => "min", + &Sum => "sum", + } + } + + fn for_function(function: &QueryFunction) -> Option { + match function.0.plain_name() { + "avg" => Some(SimpleAggregationOp::Avg), + "count" => Some(SimpleAggregationOp::Count), + "max" => Some(SimpleAggregationOp::Max), + "min" => Some(SimpleAggregationOp::Min), + "sum" => Some(SimpleAggregationOp::Sum), + _ => None, + } + } + + /// With knowledge of the types to which a variable might be bound, + /// return a `Result` to determine whether this aggregation is suitable. + /// For example, it's valid to take the `Avg` of `{Double, Long}`, invalid + /// to take `Sum` of `{Instant}`, valid to take (lexicographic) `Max` of `{String}`, + /// but invalid to take `Max` of `{Uuid, String}`. + /// + /// The returned type is the type of the result of the aggregation. + fn is_applicable_to_types(&self, possibilities: ValueTypeSet) -> Result { + use SimpleAggregationOp::*; + if possibilities.is_empty() { + bail!(ErrorKind::CannotProjectImpossibleBinding(*self)) + } + + match self { + // One can always count results. + &Count => Ok(ValueType::Long), + + // Only numeric types can be averaged or summed. + &Avg => { + if possibilities.is_subset(&ValueTypeSet::of_numeric_types()) { + // The mean of a set of numeric values will always, for our purposes, be a double. + Ok(ValueType::Double) + } else { + bail!(ErrorKind::CannotApplyAggregateOperationToTypes(*self, possibilities)) + } + }, + &Sum => { + if possibilities.is_subset(&ValueTypeSet::of_numeric_types()) { + if possibilities.contains(ValueType::Double) { + Ok(ValueType::Double) + } else { + // TODO: BigInt. + Ok(ValueType::Long) + } + } else { + bail!(ErrorKind::CannotApplyAggregateOperationToTypes(*self, possibilities)) + } + }, + + &Max | &Min => { + if possibilities.is_unit() { + use ValueType::*; + let the_type = possibilities.exemplar().expect("a type"); + match the_type { + // These types are numerically ordered. + Double | Long | Instant => Ok(the_type), + + // Boolean: false < true. + Boolean => Ok(the_type), + + // String: lexicographic order. + String => Ok(the_type), + + // These types are unordered. + Keyword | Ref | Uuid => { + bail!(ErrorKind::CannotApplyAggregateOperationToTypes(*self, possibilities)) + }, + } + } else { + // It cannot be empty -- we checked. + // The only types that are valid to compare cross-type are numbers. + if possibilities.is_subset(&ValueTypeSet::of_numeric_types()) { + // Note that if the max/min is a Long, it will be returned as a Double! + if possibilities.contains(ValueType::Double) { + Ok(ValueType::Double) + } else { + // TODO: BigInt. + Ok(ValueType::Long) + } + } else { + bail!(ErrorKind::CannotApplyAggregateOperationToTypes(*self, possibilities)) + } + } + }, + } + } +} + +struct SimpleAggregate { + op: SimpleAggregationOp, + var: Variable, +} + +impl SimpleAggregate { + fn column_name(&self) -> Name { + format!("({} {})", self.op.to_sql(), self.var.name()) + } + + fn use_static_value(&self) -> bool { + use SimpleAggregationOp::*; + match self.op { + Avg | Max | Min => true, + Count | Sum => false, + } + } +} + +trait SimpleAggregation { + fn to_simple(&self) -> Option; +} + +impl SimpleAggregation for Aggregate { + fn to_simple(&self) -> Option { + if self.args.len() != 1 { + return None; + } + self.args[0] + .as_variable() + .and_then(|v| SimpleAggregationOp::for_function(&self.func) + .map(|op| SimpleAggregate { op, var: v.clone(), })) + } +} + +/// An internal temporary struct to pass between the projection 'walk' and the +/// resultant projector. +/// Projection accumulates three things: +/// - A SQL projection list. +/// - A collection of templates for the projector to use to extract values. +/// - A list of columns to use for grouping. Grouping is a property of the projection! +struct ProjectedElements { + sql_projection: Projection, + templates: Vec, + group_by: Option>, +} + /// Walk an iterator of `Element`s, collecting projector templates and columns. /// /// Returns a pair: the SQL projection (which should always be a `Projection::Columns`) @@ -213,26 +439,44 @@ pub fn projected_column_for_var(var: &Variable, cc: &ConjoiningClauses) -> (Proj fn project_elements<'a, I: IntoIterator>( count: usize, elements: I, - query: &AlgebraicQuery) -> Result<(Projection, Vec)> { + query: &AlgebraicQuery) -> Result { let mut cols = Vec::with_capacity(count); let mut i: i32 = 0; let mut templates = vec![]; - let mut with = query.with.clone(); + // "Query variables not in aggregate expressions will group the results and appear intact + // in the result." + // Compute the set of variables projected by the query, then subtract + // those used in aggregate expressions. This will be our GROUP BY clause. + // The GROUP BY clause should begin with any non-projected :with variables, in order, + // then the non-aggregated projected variables, in order. + + // Predetermined: + // extras: variables needed for ORDER BY. query.named_projection. + // with: variables to be used for grouping. query.with. + // + // Accumulated: + // variables: variables in the projection list. + // aggregated: variables used in aggregates. + + // Results: + // group_by: (with + variables) - aggregated + // extra_projection: (with + extras) - variables + + let mut aggregated = BTreeSet::new(); + let mut variables = BTreeSet::new(); for e in elements { match e { // Each time we come across a variable, we push a SQL column // into the SQL projection, aliased to the name of the variable, // and we push an annotated index into the projector. &Element::Variable(ref var) => { - // If we're projecting this, we don't need it in :with. - with.remove(var); + variables.insert(var.clone()); - let (projected_column, maybe_type) = projected_column_for_var(&var, &query.cc); + let (projected_column, type_set) = projected_column_for_var(&var, &query.cc)?; cols.push(projected_column); - if let Some(ty) = maybe_type { - let tag = ty.value_type_tag(); + if let Some(tag) = type_set.unique_type_code() { templates.push(TypedIndex::Known(i, tag)); i += 1; // We used one SQL column. } else { @@ -243,23 +487,119 @@ fn project_elements<'a, I: IntoIterator>( let (type_column, type_name) = candidate_type_column(&query.cc, &var); cols.push(ProjectedColumn(type_column, type_name)); } - } + }, + &Element::Aggregate(ref a) => { + if let Some(simple) = a.to_simple() { + aggregated.insert(simple.var.clone()); + + // When we encounter a simple aggregate -- one in which the aggregation can be + // implemented in SQL, on a single variable -- we just push the SQL aggregation op. + // We must ensure the following: + // - There's a column for the var. + // - The type of the var is known to be restricted to a sensible input set + // (not necessarily a single type, but e.g., all vals must be Double or Long). + // - The type set must be appropriate for the operation. E.g., `Sum` is not a + // meaningful operation on instants. + + let (projected_column, return_type) = projected_column_for_simple_aggregate(&simple, &query.cc)?; + cols.push(projected_column); + + // We might regret using the type tag here instead of the `ValueType`. + templates.push(TypedIndex::Known(i, return_type.value_type_tag())); + i += 1; + } else { + // TODO: complex aggregates. + bail!(ErrorKind::NotYetImplemented("complex aggregates".into())); + } + }, } } - for var in with { - // We need to collect these into the SQL column list, but they don't affect projection. - // If a variable is of a non-fixed type, also project the type tag column, so we don't - // accidentally unify across types when considering uniqueness! - let (column, name) = candidate_column(&query.cc, &var); + // Anything we're projecting, or that's part of an aggregate, doesn't need to be in GROUP BY. + // + // Anything used in ORDER BY (which we're given in `named_projection`) + // needs to be in the SQL column list so we can refer to it by name. + // + // They don't affect projection. + // + // If a variable is of a non-fixed type, also project the type tag column, so we don't + // accidentally unify across types when considering uniqueness! + // Similarly, the type tag needs to be grouped. + // extra_projection: extras - variables + for var in query.named_projection.iter() { + if variables.contains(var) { + continue; + } + + // If it's a fixed value, we need do nothing further. + if query.cc.is_value_bound(&var) { + continue; + } + + let (column, name) = candidate_column(&query.cc, &var)?; cols.push(ProjectedColumn(column, name)); - if query.cc.known_type(&var).is_none() { + + // We don't care if a column has a single _type_, we care if it has a single type _code_, + // because that's what we'll use if we're projecting. E.g., Long and Double. + // Single type implies single type code, and is cheaper, so we check that first. + let types = query.cc.known_type_set(&var); + if !types.has_unique_type_code() { let (type_column, type_name) = candidate_type_column(&query.cc, &var); cols.push(ProjectedColumn(type_column, type_name)); } } - Ok((Projection::Columns(cols), templates)) + if aggregated.is_empty() { + // We're done -- we never need to group unless we're aggregating. + return Ok(ProjectedElements { + sql_projection: Projection::Columns(cols), + templates, + group_by: None, + }); + } + + // group_by: (with + variables) - aggregated + let mut group_by_vars: BTreeSet = query.with.union(&variables).cloned().collect(); + for var in aggregated { + group_by_vars.remove(&var); + } + + // We never need to group by a constant. + for var in query.cc.value_bound_variables() { + group_by_vars.remove(&var); + } + + // Turn this collection of vars into a collection of columns from the query. + // Right now we don't allow grouping on anything but a variable bound in the query. + // TODO: also group by type tag. + let group_by = if group_by_vars.is_empty() { + None + } else { + let mut group_cols = Vec::with_capacity(2 * group_by_vars.len()); + + for var in group_by_vars.into_iter() { + let types = query.cc.known_type_set(&var); + if !types.has_unique_type_code() { + // Group by type then SQL value. + let type_col = query.cc + .extracted_types + .get(&var) + .cloned() + .map(GroupBy::QueryColumn) + .ok_or_else(|| ErrorKind::NoTypeAvailableForVariable(var.name().clone()))?; + group_cols.push(type_col); + } + let val_col = cc_column(&query.cc, &var).map(GroupBy::QueryColumn)?; + group_cols.push(val_col); + } + Some(group_cols) + }; + + Ok(ProjectedElements { + sql_projection: Projection::Columns(cols), + templates, + group_by, + }) } pub trait Projector { @@ -295,12 +635,13 @@ impl ScalarProjector { } } - fn combine(sql: Projection, mut templates: Vec) -> Result { - let template = templates.pop().expect("Expected a single template"); + fn combine(mut elements: ProjectedElements) -> Result { + let template = elements.templates.pop().expect("Expected a single template"); Ok(CombinedProjection { - sql_projection: sql, + sql_projection: elements.sql_projection, datalog_projector: Box::new(ScalarProjector::with_template(template)), distinct: false, + group_by_cols: elements.group_by, }) } } @@ -333,19 +674,22 @@ impl TupleProjector { // This is exactly the same as for rel. fn collect_bindings<'a, 'stmt>(&self, row: Row<'a, 'stmt>) -> Result> { - assert_eq!(row.column_count(), self.len as i32); + // gte 'cos we might be querying extra columns for ordering. + // The templates will take care of ignoring columns. + assert!(row.column_count() >= self.len as i32); self.templates .iter() .map(|ti| ti.lookup(&row)) .collect::>>() } - fn combine(column_count: usize, sql: Projection, templates: Vec) -> Result { - let p = TupleProjector::with_templates(column_count, templates); + fn combine(column_count: usize, elements: ProjectedElements) -> Result { + let p = TupleProjector::with_templates(column_count, elements.templates); Ok(CombinedProjection { - sql_projection: sql, + sql_projection: elements.sql_projection, datalog_projector: Box::new(p), distinct: false, + group_by_cols: elements.group_by, }) } } @@ -381,19 +725,22 @@ impl RelProjector { } fn collect_bindings<'a, 'stmt>(&self, row: Row<'a, 'stmt>) -> Result> { - assert_eq!(row.column_count(), self.len as i32); + // gte 'cos we might be querying extra columns for ordering. + // The templates will take care of ignoring columns. + assert!(row.column_count() >= self.len as i32); self.templates .iter() .map(|ti| ti.lookup(&row)) .collect::>>() } - fn combine(column_count: usize, sql: Projection, templates: Vec) -> Result { - let p = RelProjector::with_templates(column_count, templates); + fn combine(column_count: usize, elements: ProjectedElements) -> Result { + let p = RelProjector::with_templates(column_count, elements.templates); Ok(CombinedProjection { - sql_projection: sql, + sql_projection: elements.sql_projection, datalog_projector: Box::new(p), distinct: true, + group_by_cols: elements.group_by, }) } } @@ -423,12 +770,13 @@ impl CollProjector { } } - fn combine(sql: Projection, mut templates: Vec) -> Result { - let template = templates.pop().expect("Expected a single template"); + fn combine(mut elements: ProjectedElements) -> Result { + let template = elements.templates.pop().expect("Expected a single template"); Ok(CombinedProjection { - sql_projection: sql, + sql_projection: elements.sql_projection, datalog_projector: Box::new(CollProjector::with_template(template)), distinct: true, + group_by_cols: elements.group_by, }) } } @@ -458,6 +806,10 @@ pub struct CombinedProjection { /// True if this query requires the SQL query to include DISTINCT. pub distinct: bool, + + // An optional list of column names to use as a GROUP BY clause. + // Right now these are all `Name`s, and present in the `Projection`. + pub group_by_cols: Option>, } impl CombinedProjection { @@ -488,29 +840,30 @@ pub fn query_projection(query: &AlgebraicQuery) -> Result { sql_projection: Projection::One, datalog_projector: Box::new(constant_projector), distinct: false, + group_by_cols: None, // TODO }) } else { match query.find_spec { FindColl(ref element) => { - let (cols, templates) = project_elements(1, iter::once(element), query)?; - CollProjector::combine(cols, templates).map(|p| p.flip_distinct_for_limit(&query.limit)) + let e = project_elements(1, iter::once(element), query)?; + CollProjector::combine(e).map(|p| p.flip_distinct_for_limit(&query.limit)) }, FindScalar(ref element) => { - let (cols, templates) = project_elements(1, iter::once(element), query)?; - ScalarProjector::combine(cols, templates) + let e = project_elements(1, iter::once(element), query)?; + ScalarProjector::combine(e) }, FindRel(ref elements) => { let column_count = query.find_spec.expected_column_count(); - let (cols, templates) = project_elements(column_count, elements, query)?; - RelProjector::combine(column_count, cols, templates).map(|p| p.flip_distinct_for_limit(&query.limit)) + let e = project_elements(column_count, elements, query)?; + RelProjector::combine(column_count, e).map(|p| p.flip_distinct_for_limit(&query.limit)) }, FindTuple(ref elements) => { let column_count = query.find_spec.expected_column_count(); - let (cols, templates) = project_elements(column_count, elements, query)?; - TupleProjector::combine(column_count, cols, templates) + let e = project_elements(column_count, elements, query)?; + TupleProjector::combine(column_count, e) }, } } diff --git a/query-projector/tests/aggregates.rs b/query-projector/tests/aggregates.rs new file mode 100644 index 00000000..efe7900d --- /dev/null +++ b/query-projector/tests/aggregates.rs @@ -0,0 +1,87 @@ +// Copyright 2016 Mozilla +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use +// this file except in compliance with the License. You may obtain a copy of the +// License at http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +extern crate mentat_core; +extern crate mentat_query; +extern crate mentat_query_algebrizer; +extern crate mentat_query_parser; +extern crate mentat_query_projector; + +use mentat_core::{ + Attribute, + Entid, + Schema, + ValueType, +}; + +use mentat_query_parser::{ + parse_find_string, +}; + +use mentat_query::{ + NamespacedKeyword, +}; + +use mentat_query_algebrizer::{ + algebrize, +}; + +use mentat_query_projector::{ + query_projection, +}; + +// These are helpers that tests use to build Schema instances. +fn associate_ident(schema: &mut Schema, i: NamespacedKeyword, e: Entid) { + schema.entid_map.insert(e, i.clone()); + schema.ident_map.insert(i.clone(), e); +} + +fn add_attribute(schema: &mut Schema, e: Entid, a: Attribute) { + schema.schema_map.insert(e, a); +} + +fn prepopulated_schema() -> Schema { + let mut schema = Schema::default(); + associate_ident(&mut schema, NamespacedKeyword::new("foo", "name"), 65); + associate_ident(&mut schema, NamespacedKeyword::new("foo", "age"), 68); + associate_ident(&mut schema, NamespacedKeyword::new("foo", "height"), 69); + add_attribute(&mut schema, 65, Attribute { + value_type: ValueType::String, + multival: false, + ..Default::default() + }); + add_attribute(&mut schema, 68, Attribute { + value_type: ValueType::Long, + multival: false, + ..Default::default() + }); + add_attribute(&mut schema, 69, Attribute { + value_type: ValueType::Long, + multival: false, + ..Default::default() + }); + schema +} + +#[test] +fn test_aggregate_unsuitable_type() { + let schema = prepopulated_schema(); + + let query = r#"[:find (avg ?e) + :where + [?e :foo/age ?a]]"#; + + // While the query itself algebrizes and parses… + let parsed = parse_find_string(query).expect("query input to have parsed"); + let algebrized = algebrize(&schema, parsed).expect("query algebrizes"); + + // … when we look at the projection list, we cannot reconcile the types. + assert!(query_projection(&algebrized).is_err()); +} diff --git a/query-sql/src/lib.rs b/query-sql/src/lib.rs index f6edbfa8..8312436f 100644 --- a/query-sql/src/lib.rs +++ b/query-sql/src/lib.rs @@ -15,10 +15,10 @@ extern crate mentat_query_algebrizer; extern crate mentat_sql; use std::boxed::Box; - use mentat_core::{ Entid, TypedValue, + ValueType, }; use mentat_query::{ @@ -60,6 +60,11 @@ pub enum ColumnOrExpression { Integer(i32), // We use these for type codes etc. Long(i64), Value(TypedValue), + Expression(Box, ValueType), // Track the return type. +} + +pub enum Expression { + Unary { sql_op: &'static str, arg: ColumnOrExpression }, } /// `QueryValue` and `ColumnOrExpression` are almost identical… merge somehow? @@ -84,6 +89,26 @@ pub enum Projection { One, } +#[derive(PartialEq, Eq)] +pub enum GroupBy { + ProjectedColumn(Name), + QueryColumn(QualifiedAlias), + // TODO: non-projected expressions, etc. +} + +impl QueryFragment for GroupBy { + fn push_sql(&self, out: &mut QueryBuilder) -> BuildQueryResult { + match self { + &GroupBy::ProjectedColumn(ref name) => { + out.push_identifier(name.as_str()) + }, + &GroupBy::QueryColumn(ref qa) => { + qualified_alias_push_sql(out, qa) + }, + } + } +} + #[derive(Copy, Clone)] pub struct Op(pub &'static str); // TODO: we can do better than this! @@ -185,6 +210,7 @@ pub struct SelectQuery { pub projection: Projection, pub from: FromClause, pub constraints: Vec, + pub group_by: Option>, pub order: Vec, pub limit: Limit, } @@ -257,10 +283,8 @@ impl QueryFragment for ColumnOrExpression { fn push_sql(&self, out: &mut QueryBuilder) -> BuildQueryResult { use self::ColumnOrExpression::*; match self { - &Column(QualifiedAlias(ref table, ref column)) => { - out.push_identifier(table.as_str())?; - out.push_sql("."); - push_column(out, column) + &Column(ref qa) => { + qualified_alias_push_sql(out, qa) }, &Entid(entid) => { out.push_sql(entid.to_string().as_str()); @@ -277,6 +301,23 @@ impl QueryFragment for ColumnOrExpression { &Value(ref v) => { out.push_typed_value(v) }, + &Expression(ref e, _) => { + e.push_sql(out) + }, + } + } +} + +impl QueryFragment for Expression { + fn push_sql(&self, out: &mut QueryBuilder) -> BuildQueryResult { + match self { + &Expression::Unary { ref sql_op, ref arg } => { + out.push_sql(sql_op); // No need to escape built-ins. + out.push_sql("("); + arg.push_sql(out)?; + out.push_sql(")"); + Ok(()) + }, } } } @@ -379,6 +420,13 @@ impl QueryFragment for JoinOp { } } +// We don't own QualifiedAlias or QueryFragment, so we can't implement the trait. +fn qualified_alias_push_sql(out: &mut QueryBuilder, qa: &QualifiedAlias) -> BuildQueryResult { + out.push_identifier(qa.0.as_str())?; + out.push_sql("."); + push_column(out, &qa.1) +} + // We don't own SourceAlias or QueryFragment, so we can't implement the trait. fn source_alias_push_sql(out: &mut QueryBuilder, sa: &SourceAlias) -> BuildQueryResult { let &SourceAlias(ref table, ref alias) = sa; @@ -527,6 +575,16 @@ impl QueryFragment for SelectQuery { { out.push_sql(" AND ") }); } + match self.group_by { + Some(ref group_by) if !group_by.is_empty() => { + out.push_sql(" GROUP BY "); + interpose!(group, group_by, + { group.push_sql(out)? }, + { out.push_sql(", ") }); + }, + _ => {}, + } + if !self.order.is_empty() { out.push_sql(" ORDER BY "); interpose!(&OrderBy(ref dir, ref var), self.order, @@ -727,6 +785,7 @@ mod tests { right: ColumnOrExpression::Entid(65536), }, ], + group_by: None, order: vec![], limit: Limit::None, }; diff --git a/query-translator/src/translate.rs b/query-translator/src/translate.rs index 47019a1e..b6fe11f2 100644 --- a/query-translator/src/translate.rs +++ b/query-translator/src/translate.rs @@ -15,7 +15,9 @@ use mentat_core::{ ValueType, }; -use mentat_query::Limit; +use mentat_query::{ + Limit, +}; use mentat_query_algebrizer::{ AlgebraicQuery, @@ -47,6 +49,7 @@ use mentat_query_sql::{ ColumnOrExpression, Constraint, FromClause, + GroupBy, Op, ProjectedColumn, Projection, @@ -214,7 +217,8 @@ fn table_for_computed(computed: ComputedTable, alias: TableAlias) -> TableOrSubq // project it as the variable name. // E.g., SELECT datoms03.v AS `?x`. for var in projection.iter() { - let (projected_column, maybe_type) = projected_column_for_var(var, &cc); + // TODO: chain results out. + let (projected_column, type_set) = projected_column_for_var(var, &cc).expect("every var to be bound"); columns.push(projected_column); // Similarly, project type tags if they're not known conclusively in the @@ -222,10 +226,10 @@ fn table_for_computed(computed: ComputedTable, alias: TableAlias) -> TableOrSubq // Assumption: we'll never need to project a tag without projecting the value of a variable. if type_extraction.contains(var) { let expression = - if let Some(ty) = maybe_type { + if let Some(tag) = type_set.unique_type_code() { // If we know the type for sure, just project the constant. // SELECT datoms03.v AS `?x`, 10 AS `?x_value_type_tag` - ColumnOrExpression::Integer(ty.value_type_tag()) + ColumnOrExpression::Integer(tag) } else { // Otherwise, we'll have an established type binding! This'll be // either a datoms table or, recursively, a subquery. Project @@ -246,7 +250,7 @@ fn table_for_computed(computed: ComputedTable, alias: TableAlias) -> TableOrSubq // Each arm simply turns into a subquery. // The SQL translation will stuff "UNION" between each arm. let projection = Projection::Columns(columns); - cc_to_select_query(projection, cc, false, None, Limit::None) + cc_to_select_query(projection, cc, false, None, None, Limit::None) }).collect(), alias) }, @@ -268,6 +272,7 @@ fn table_for_computed(computed: ComputedTable, alias: TableAlias) -> TableOrSubq fn cc_to_select_query(projection: Projection, cc: ConjoiningClauses, distinct: bool, + group_by: Option>, order: Option>, limit: Limit) -> SelectQuery { let from = if cc.from.is_empty() { @@ -303,6 +308,7 @@ fn cc_to_select_query(projection: Projection, distinct: distinct, projection: projection, from: from, + group_by: group_by, constraints: cc.wheres .into_iter() .map(|c| c.to_constraint()) @@ -321,12 +327,13 @@ pub fn cc_to_exists(cc: ConjoiningClauses) -> SelectQuery { distinct: false, projection: Projection::One, from: FromClause::Nothing, + group_by: None, constraints: vec![], order: vec![], limit: Limit::None, } } else { - cc_to_select_query(Projection::One, cc, false, None, Limit::None) + cc_to_select_query(Projection::One, cc, false, None, None, Limit::None) } } @@ -335,9 +342,9 @@ pub fn cc_to_exists(cc: ConjoiningClauses) -> SelectQuery { pub fn query_to_select(query: AlgebraicQuery) -> Result { // TODO: we can't pass `query.limit` here if we aggregate during projection. // SQL-based aggregation -- `SELECT SUM(datoms00.e)` -- is fine. - let CombinedProjection { sql_projection, datalog_projector, distinct } = query_projection(&query)?; + let CombinedProjection { sql_projection, datalog_projector, distinct, group_by_cols } = query_projection(&query)?; Ok(ProjectedSelect { - query: cc_to_select_query(sql_projection, query.cc, distinct, query.order, query.limit), + query: cc_to_select_query(sql_projection, query.cc, distinct, group_by_cols, query.order, query.limit), projector: datalog_projector, }) } diff --git a/query-translator/tests/translate.rs b/query-translator/tests/translate.rs index 605a4e71..431d79c0 100644 --- a/query-translator/tests/translate.rs +++ b/query-translator/tests/translate.rs @@ -546,13 +546,13 @@ fn test_with_without_aggregate() { // Known type. let query = r#"[:find ?x :with ?y :where [?x :foo/bar ?y]]"#; let SQLQuery { sql, args } = translate(&schema, query); - assert_eq!(sql, "SELECT DISTINCT `datoms00`.e AS `?x`, `datoms00`.v AS `?y` FROM `datoms` AS `datoms00` WHERE `datoms00`.a = 99"); + assert_eq!(sql, "SELECT DISTINCT `datoms00`.e AS `?x` FROM `datoms` AS `datoms00` WHERE `datoms00`.a = 99"); assert_eq!(args, vec![]); // Unknown type. let query = r#"[:find ?x :with ?y :where [?x _ ?y]]"#; let SQLQuery { sql, args } = translate(&schema, query); - assert_eq!(sql, "SELECT DISTINCT `all_datoms00`.e AS `?x`, `all_datoms00`.v AS `?y`, `all_datoms00`.value_type_tag AS `?y_value_type_tag` FROM `all_datoms` AS `all_datoms00`"); + assert_eq!(sql, "SELECT DISTINCT `all_datoms00`.e AS `?x` FROM `all_datoms` AS `all_datoms00`"); assert_eq!(args, vec![]); } @@ -955,3 +955,30 @@ fn test_instant_range() { AND `datoms00`.v > 1497574601257000"); assert_eq!(args, vec![]); } + +#[test] +fn test_project_aggregates() { + let schema = prepopulated_typed_schema(ValueType::Long); + let query = r#"[:find ?e (max ?t) + :where + [?e :foo/bar ?t]]"#; + let SQLQuery { sql, args } = translate(&schema, query); + assert_eq!(sql, "SELECT DISTINCT `datoms00`.e AS `?e`, max(`datoms00`.v) AS `(max ?t)` \ + FROM \ + `datoms` AS `datoms00` \ + WHERE `datoms00`.a = 99 \ + GROUP BY `datoms00`.e"); + assert_eq!(args, vec![]); + + let query = r#"[:find (max ?t) + :with ?e + :where + [?e :foo/bar ?t]]"#; + let SQLQuery { sql, args } = translate(&schema, query); + assert_eq!(sql, "SELECT DISTINCT max(`datoms00`.v) AS `(max ?t)` \ + FROM \ + `datoms` AS `datoms00` \ + WHERE `datoms00`.a = 99 \ + GROUP BY `datoms00`.e"); + assert_eq!(args, vec![]); +} diff --git a/query/src/lib.rs b/query/src/lib.rs index 1dc7168b..d04ba2d0 100644 --- a/query/src/lib.rs +++ b/query/src/lib.rs @@ -428,17 +428,16 @@ pub struct Pull { } */ -/* +#[derive(Debug, Eq, PartialEq)] pub struct Aggregate { - pub fn_name: String, + pub func: QueryFunction, pub args: Vec, } -*/ #[derive(Debug, Eq, PartialEq)] pub enum Element { Variable(Variable), - // Aggregate(Aggregate), // TODO + Aggregate(Aggregate), // Pull(Pull), // TODO } diff --git a/tests/query.rs b/tests/query.rs index 7697d8b5..6255dc4d 100644 --- a/tests/query.rs +++ b/tests/query.rs @@ -361,7 +361,8 @@ fn test_fulltext() { _ => panic!("Unexpected results."), } }, - _ => panic!("Expected query to work."), + Result::Ok(r) => panic!("Unexpected results {:?}.", r), + Result::Err(e) => panic!("Expected query to work, got {:?}.", e), } let a = conn.transact(&mut c, r#"[[:db/add "a" :foo/term "talk"]]"#) @@ -454,3 +455,275 @@ fn test_instant_range_query() { _ => panic!("Expected query to work."), } } + +#[test] +fn test_aggregation_implicit_grouping() { + let mut c = new_connection("").expect("Couldn't open conn."); + let mut conn = Conn::connect(&mut c).expect("Couldn't open DB."); + + conn.transact(&mut c, r#"[ + [:db/add "a" :db/ident :foo/score] + [:db/add "a" :db/valueType :db.type/long] + [:db/add "a" :db/cardinality :db.cardinality/one] + [:db/add "b" :db/ident :foo/name] + [:db/add "b" :db/valueType :db.type/string] + [:db/add "b" :db/cardinality :db.cardinality/one] + [:db/add "c" :db/ident :foo/is-vegetarian] + [:db/add "c" :db/valueType :db.type/boolean] + [:db/add "c" :db/cardinality :db.cardinality/one] + [:db/add "d" :db/ident :foo/play] + [:db/add "d" :db/valueType :db.type/ref] + [:db/add "d" :db/cardinality :db.cardinality/many] + [:db/add "d" :db/unique :db.unique/value] + ]"#).unwrap(); + + let ids = conn.transact(&mut c, r#"[ + [:db/add "a" :foo/name "Alice"] + [:db/add "b" :foo/name "Beli"] + [:db/add "c" :foo/name "Carlos"] + [:db/add "d" :foo/name "Diana"] + [:db/add "a" :foo/is-vegetarian true] + [:db/add "b" :foo/is-vegetarian true] + [:db/add "c" :foo/is-vegetarian false] + [:db/add "d" :foo/is-vegetarian false] + [:db/add "aa" :foo/score 14] + [:db/add "ab" :foo/score 99] + [:db/add "ac" :foo/score 14] + [:db/add "ba" :foo/score 22] + [:db/add "bb" :foo/score 11] + [:db/add "ca" :foo/score 42] + [:db/add "da" :foo/score 5] + [:db/add "db" :foo/score 28] + [:db/add "d" :foo/play "da"] + [:db/add "d" :foo/play "db"] + [:db/add "a" :foo/play "aa"] + [:db/add "a" :foo/play "ab"] + [:db/add "a" :foo/play "ac"] + [:db/add "b" :foo/play "ba"] + [:db/add "b" :foo/play "bb"] + [:db/add "c" :foo/play "ca"] + ]"#).unwrap().tempids; + + // We can combine these aggregates. + let r = conn.q_once(&mut c, + r#"[:find ?x ?name (max ?score) (count ?score) (avg ?score) + :where + [?x :foo/name ?name] + [?x :foo/play ?game] + [?game :foo/score ?score] + ]"#, None); + match r { + Result::Ok(QueryResults::Rel(vals)) => { + assert_eq!(vals, + vec![ + vec![TypedValue::Ref(ids.get("a").cloned().unwrap()), + TypedValue::String("Alice".to_string().into()), + TypedValue::Long(99), + TypedValue::Long(3), + TypedValue::Double((127f64 / 3f64).into())], + vec![TypedValue::Ref(ids.get("b").cloned().unwrap()), + TypedValue::String("Beli".to_string().into()), + TypedValue::Long(22), + TypedValue::Long(2), + TypedValue::Double((33f64 / 2f64).into())], + vec![TypedValue::Ref(ids.get("c").cloned().unwrap()), + TypedValue::String("Carlos".to_string().into()), + TypedValue::Long(42), + TypedValue::Long(1), + TypedValue::Double(42f64.into())], + vec![TypedValue::Ref(ids.get("d").cloned().unwrap()), + TypedValue::String("Diana".to_string().into()), + TypedValue::Long(28), + TypedValue::Long(2), + TypedValue::Double((33f64 / 2f64).into())]]); + }, + Result::Ok(x) => panic!("Got unexpected results {:?}", x), + Result::Err(e) => panic!("Expected query to work: got {:?}", e), + } +} + +// TODO: this can't be phrased in Datalog! +/* +#[test] +fn test_corresponding_row_value_aggregation() { + + // Who's youngest, via min? + let r = conn.q_once(&mut c, + r#"[:find [?name (min ?age)] + :where + [?x :foo/age ?age] + [?x :foo/name ?name]]"#, None); + match r { + Result::Ok(QueryResults::Tuple(Some(vals))) => { + assert_eq!(vals, + vec![TypedValue::String("Alice".to_string().into()), + TypedValue::Long(14)]); + }, + _ => panic!("Expected query to work."), + } + + // Who's oldest, via max? + let r = conn.q_once(&mut c, + r#"[:find [?name (max ?age)] + :where + [?x :foo/age ?age] + [?x :foo/name ?name]]"#, None); + match r { + Result::Ok(QueryResults::Tuple(Some(vals))) => { + assert_eq!(vals, + vec![TypedValue::String("Carlos".to_string().into()), + TypedValue::Long(42)]); + }, + _ => panic!("Expected query to work."), + } +} +*/ + +#[test] +fn test_simple_aggregation() { + let mut c = new_connection("").expect("Couldn't open conn."); + let mut conn = Conn::connect(&mut c).expect("Couldn't open DB."); + + conn.transact(&mut c, r#"[ + [:db/add "a" :db/ident :foo/age] + [:db/add "a" :db/valueType :db.type/long] + [:db/add "a" :db/cardinality :db.cardinality/one] + [:db/add "b" :db/ident :foo/name] + [:db/add "b" :db/valueType :db.type/string] + [:db/add "b" :db/cardinality :db.cardinality/one] + [:db/add "c" :db/ident :foo/is-vegetarian] + [:db/add "c" :db/valueType :db.type/boolean] + [:db/add "c" :db/cardinality :db.cardinality/one] + ]"#).unwrap(); + + let ids = conn.transact(&mut c, r#"[ + [:db/add "a" :foo/name "Alice"] + [:db/add "b" :foo/name "Beli"] + [:db/add "c" :foo/name "Carlos"] + [:db/add "d" :foo/name "Diana"] + [:db/add "a" :foo/is-vegetarian true] + [:db/add "b" :foo/is-vegetarian true] + [:db/add "c" :foo/is-vegetarian false] + [:db/add "d" :foo/is-vegetarian false] + [:db/add "a" :foo/age 14] + [:db/add "b" :foo/age 22] + [:db/add "c" :foo/age 42] + [:db/add "d" :foo/age 28] + ]"#).unwrap().tempids; + + // What are the oldest and youngest ages? + let r = conn.q_once(&mut c, + r#"[:find [(min ?age) (max ?age)] + :where + [_ :foo/age ?age]]"#, None); + match r { + Result::Ok(QueryResults::Tuple(Some(vals))) => { + assert_eq!(vals, + vec![TypedValue::Long(14), + TypedValue::Long(42)]); + }, + _ => panic!("Expected query to work."), + } + + // Who's youngest, via order? + let r = conn.q_once(&mut c, + r#"[:find [?name ?age] + :order (asc ?age) + :where + [?x :foo/age ?age] + [?x :foo/name ?name]]"#, None); + match r { + Result::Ok(QueryResults::Tuple(Some(vals))) => { + assert_eq!(vals, + vec![TypedValue::String("Alice".to_string().into()), + TypedValue::Long(14)]); + }, + Result::Ok(r) => panic!("Unexpected results {:?}", r), + Result::Err(e) => panic!("Expected query to work, got {:?}", e), + } + + // Who's oldest, via order? + let r = conn.q_once(&mut c, + r#"[:find [?name ?age] + :order (desc ?age) + :where + [?x :foo/age ?age] + [?x :foo/name ?name]]"#, None); + match r { + Result::Ok(QueryResults::Tuple(Some(vals))) => { + assert_eq!(vals, + vec![TypedValue::String("Carlos".to_string().into()), + TypedValue::Long(42)]); + }, + _ => panic!("Expected query to work."), + } + + // What's the average age? + let r = conn.q_once(&mut c, + r#"[:find (avg ?age) . + :where + [_ :foo/age ?age]]"#, None); + match r { + Result::Ok(QueryResults::Scalar(Some(sum))) => { + assert_eq!(sum, TypedValue::Double(26.5f64.into())); + }, + _ => panic!("Expected query to work."), + } + + // What's the total age? + let r = conn.q_once(&mut c, + r#"[:find (sum ?age) . + :where + [_ :foo/age ?age]]"#, None); + match r { + Result::Ok(QueryResults::Scalar(Some(sum))) => { + assert_eq!(sum, TypedValue::Long(106)); + }, + _ => panic!("Expected query to work."), + } + + // How many distinct names are there? + let r = conn.q_once(&mut c, + r#"[:find (count ?name) . + :where + [_ :foo/name ?name]]"#, None); + match r { + Result::Ok(QueryResults::Scalar(Some(count))) => { + assert_eq!(count, TypedValue::Long(4)); + }, + _ => panic!("Expected query to work."), + } + + // We can use constraints, too. + // What's the average age of adults? + let r = conn.q_once(&mut c, + r#"[:find [(avg ?age) (count ?age)] + :where + [_ :foo/age ?age] + [(>= ?age 18)]]"#, None); + match r { + Result::Ok(QueryResults::Tuple(Some(vals))) => { + assert_eq!(vals, vec![TypedValue::Double((92f64 / 3f64).into()), + TypedValue::Long(3)]); + }, + Result::Ok(x) => panic!("Got unexpected results {:?}", x), + Result::Err(e) => panic!("Expected query to work: got {:?}", e), + } + + // Who's oldest, vegetarians or not? + let r = conn.q_once(&mut c, + r#"[:find ?veg (max ?age) + :where + [?p :foo/age ?age] + [?p :foo/is-vegetarian ?veg]]"#, None); + match r { + Result::Ok(QueryResults::Rel(vals)) => { + assert_eq!(vals, vec![ + vec![TypedValue::Boolean(false), TypedValue::Long(42)], + vec![TypedValue::Boolean(true), TypedValue::Long(22)], + ]); + }, + Result::Ok(x) => panic!("Got unexpected results {:?}", x), + Result::Err(e) => panic!("Expected query to work: got {:?}", e), + } +}