[query] Handle SQL NULL for aggregates over 0 rows. (#684) (#688) r=rnewman

This uses a `SELECT *` from an inner subselect to filter potentially `NULL` aggregates.

The alternative is to handle `NULL` values throughout the projector, which is simple but loses a valuable invariant: Mentat SQL queries produce values that are not `NULL`.
This commit is contained in:
Nick Alexander 2018-06-01 14:17:31 -07:00 committed by GitHub
parent 2a025916fe
commit d8d18a1731
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 283 additions and 14 deletions

View file

@ -330,7 +330,7 @@ impl Debug for QueryValue {
/// Represents an entry in the ORDER BY list: a variable or a variable's type tag. /// Represents an entry in the ORDER BY list: a variable or a variable's type tag.
/// (We require order vars to be projected, so we can simply use a variable here.) /// (We require order vars to be projected, so we can simply use a variable here.)
#[derive(Debug)] #[derive(Clone, Debug, Eq, PartialEq)]
pub struct OrderBy(pub Direction, pub VariableColumn); pub struct OrderBy(pub Direction, pub VariableColumn);
impl From<Order> for OrderBy { impl From<Order> for OrderBy {

View file

@ -164,6 +164,15 @@ impl SimpleAggregate {
Count | Sum => false, Count | Sum => false,
} }
} }
/// Return `true` if this aggregate can be `NULL` over 0 rows.
pub(crate) fn is_nullable(&self) -> bool {
use self::SimpleAggregationOp::*;
match self.op {
Avg | Max | Min => true,
Count | Sum => false,
}
}
} }
pub(crate) trait SimpleAggregation { pub(crate) trait SimpleAggregation {
@ -201,8 +210,12 @@ pub(crate) fn projected_column_for_simple_aggregate(simple: &SimpleAggregate, cc
sql_op: simple.op.to_sql(), sql_op: simple.op.to_sql(),
arg: ColumnOrExpression::Value(value), arg: ColumnOrExpression::Value(value),
}; };
if simple.is_nullable() {
ColumnOrExpression::NullableAggregate(Box::new(expression), return_type)
} else {
ColumnOrExpression::Expression(Box::new(expression), return_type) ColumnOrExpression::Expression(Box::new(expression), return_type)
} }
}
} else { } else {
// The common case: the values are bound during execution. // The common case: the values are bound during execution.
let name = VariableColumn::Variable(simple.var.clone()).column_name(); let name = VariableColumn::Variable(simple.var.clone()).column_name();
@ -210,7 +223,11 @@ pub(crate) fn projected_column_for_simple_aggregate(simple: &SimpleAggregate, cc
sql_op: simple.op.to_sql(), sql_op: simple.op.to_sql(),
arg: ColumnOrExpression::ExistingColumn(name), arg: ColumnOrExpression::ExistingColumn(name),
}; };
if simple.is_nullable() {
ColumnOrExpression::NullableAggregate(Box::new(expression), return_type)
} else {
ColumnOrExpression::Expression(Box::new(expression), return_type) ColumnOrExpression::Expression(Box::new(expression), return_type)
}
}; };
Ok((ProjectedColumn(projected_column_or_expression, simple.column_name()), return_type)) Ok((ProjectedColumn(projected_column_or_expression, simple.column_name()), return_type))
} }

View file

@ -62,6 +62,9 @@ pub enum ColumnOrExpression {
Integer(i32), // We use these for type codes etc. Integer(i32), // We use these for type codes etc.
Long(i64), Long(i64),
Value(TypedValue), Value(TypedValue),
// Some aggregates (`min`, `max`, `avg`) can be over 0 rows, and therefore can be `NULL`; that
// needs special treatment.
NullableAggregate(Box<Expression>, ValueType), // Track the return type.
Expression(Box<Expression>, ValueType), // Track the return type. Expression(Box<Expression>, ValueType), // Track the return type.
} }
@ -130,6 +133,12 @@ pub enum Constraint {
left: ColumnOrExpression, left: ColumnOrExpression,
list: Vec<ColumnOrExpression>, list: Vec<ColumnOrExpression>,
}, },
IsNull {
value: ColumnOrExpression,
},
IsNotNull {
value: ColumnOrExpression,
},
NotExists { NotExists {
subquery: TableOrSubquery, subquery: TableOrSubquery,
}, },
@ -278,6 +287,7 @@ impl QueryFragment for ColumnOrExpression {
&Value(ref v) => { &Value(ref v) => {
out.push_typed_value(v) out.push_typed_value(v)
}, },
&NullableAggregate(ref e, _) |
&Expression(ref e, _) => { &Expression(ref e, _) => {
e.push_sql(out) e.push_sql(out)
}, },
@ -343,6 +353,18 @@ impl QueryFragment for Constraint {
right.push_sql(out) right.push_sql(out)
}, },
&IsNull { ref value } => {
value.push_sql(out)?;
out.push_sql(" IS NULL");
Ok(())
},
&IsNotNull { ref value } => {
value.push_sql(out)?;
out.push_sql(" IS NOT NULL");
Ok(())
},
&And { ref constraints } => { &And { ref constraints } => {
// An empty intersection is true. // An empty intersection is true.
if constraints.is_empty() { if constraints.is_empty() {

View file

@ -427,7 +427,27 @@ fn re_project(mut inner: SelectQuery, projection: Projection) -> SelectQuery {
let limit = inner.limit; let limit = inner.limit;
inner.limit = Limit::None; inner.limit = Limit::None;
SelectQuery { use self::Projection::*;
let nullable = match &projection {
&Columns(ref columns) => {
columns.iter().filter_map(|pc| {
match pc {
&ProjectedColumn(ColumnOrExpression::NullableAggregate(_, _), ref name) => {
Some(Constraint::IsNotNull {
value: ColumnOrExpression::ExistingColumn(name.clone()),
})
},
_ => None,
}
}).collect()
},
&Star => vec![],
&One => vec![],
};
if nullable.is_empty() {
return SelectQuery {
distinct: outer_distinct, distinct: outer_distinct,
projection: projection, projection: projection,
from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(inner))])), from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(inner))])),
@ -435,6 +455,35 @@ fn re_project(mut inner: SelectQuery, projection: Projection) -> SelectQuery {
group_by: group_by, group_by: group_by,
order: order_by, order: order_by,
limit: limit, limit: limit,
};
}
// Our pattern is `SELECT * FROM (SELECT ...) WHERE (nullable aggregate) IS NOT NULL`. If
// there's an `ORDER BY` in the subselect, SQL does not guarantee that the outer select will
// respect that order. But `ORDER BY` is relevant to the subselect when we have a `LIMIT`.
// Thus we lift the `ORDER BY` if theres no `LIMIT` in the subselect, and repeat the `ORDER BY`
// if there is.
let subselect = SelectQuery {
distinct: outer_distinct,
projection: projection,
from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(inner))])),
constraints: vec![],
group_by: group_by,
order: match &limit {
&Limit::None => vec![],
&Limit::Fixed(_) | &Limit::Variable(_) => order_by.clone(),
},
limit,
};
SelectQuery {
distinct: false,
projection: Projection::Star,
from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(subselect))])),
constraints: nullable,
group_by: vec![],
order: order_by,
limit: Limit::None, // Any limiting comes from the internal query.
} }
} }

View file

@ -1091,14 +1091,17 @@ fn test_project_aggregates() {
let SQLQuery { sql, args } = translate(&schema, query); let SQLQuery { sql, args } = translate(&schema, query);
// No outer DISTINCT: we aggregate or group by every variable. // No outer DISTINCT: we aggregate or group by every variable.
assert_eq!(sql, "SELECT `?e` AS `?e`, max(`?t`) AS `(max ?t)` \ assert_eq!(sql, "SELECT * \
FROM \
(SELECT `?e` AS `?e`, max(`?t`) AS `(max ?t)` \
FROM \ FROM \
(SELECT DISTINCT \ (SELECT DISTINCT \
`datoms00`.e AS `?e`, \ `datoms00`.e AS `?e`, \
`datoms00`.v AS `?t` \ `datoms00`.v AS `?t` \
FROM `datoms` AS `datoms00` \ FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99) \ WHERE `datoms00`.a = 99) \
GROUP BY `?e`"); GROUP BY `?e`) \
WHERE `(max ?t)` IS NOT NULL");
assert_eq!(args, vec![]); assert_eq!(args, vec![]);
let query = r#"[:find (max ?t) let query = r#"[:find (max ?t)
@ -1106,7 +1109,75 @@ fn test_project_aggregates() {
:where :where
[?e :foo/bar ?t]]"#; [?e :foo/bar ?t]]"#;
let SQLQuery { sql, args } = translate(&schema, query); let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT max(`?t`) AS `(max ?t)` \ assert_eq!(sql, "SELECT * \
FROM \
(SELECT max(`?t`) AS `(max ?t)` \
FROM \
(SELECT DISTINCT \
`datoms00`.v AS `?t`, \
`datoms00`.e AS `?e` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99)\
) \
WHERE `(max ?t)` IS NOT NULL");
assert_eq!(args, vec![]);
// ORDER BY lifted to outer query if there is no LIMIT.
let query = r#"[:find (max ?x)
:with ?e
:where
[?e ?a ?t]
[?t :foo/bar ?x]
:order ?a]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT * \
FROM \
(SELECT max(`?x`) AS `(max ?x)`, `?a` AS `?a` \
FROM \
(SELECT DISTINCT \
`datoms01`.v AS `?x`, \
`datoms00`.a AS `?a`, \
`datoms00`.e AS `?e` \
FROM `datoms` AS `datoms00`, `datoms` AS `datoms01` \
WHERE `datoms01`.a = 99 AND `datoms00`.v = `datoms01`.e) \
GROUP BY `?a`) \
WHERE `(max ?x)` IS NOT NULL \
ORDER BY `?a` ASC");
assert_eq!(args, vec![]);
// ORDER BY duplicated in outer query if there is a LIMIT.
let query = r#"[:find (max ?x)
:with ?e
:where
[?e ?a ?t]
[?t :foo/bar ?x]
:order (desc ?a)
:limit 10]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT * \
FROM \
(SELECT max(`?x`) AS `(max ?x)`, `?a` AS `?a` \
FROM \
(SELECT DISTINCT \
`datoms01`.v AS `?x`, \
`datoms00`.a AS `?a`, \
`datoms00`.e AS `?e` \
FROM `datoms` AS `datoms00`, `datoms` AS `datoms01` \
WHERE `datoms01`.a = 99 AND `datoms00`.v = `datoms01`.e) \
GROUP BY `?a` \
ORDER BY `?a` DESC \
LIMIT 10) \
WHERE `(max ?x)` IS NOT NULL \
ORDER BY `?a` DESC");
assert_eq!(args, vec![]);
// No outer SELECT * for non-nullable aggregates.
let query = r#"[:find (count ?t)
:with ?e
:where
[?e :foo/bar ?t]]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT count(`?t`) AS `(count ?t)` \
FROM \ FROM \
(SELECT DISTINCT \ (SELECT DISTINCT \
`datoms00`.v AS `?t`, \ `datoms00`.v AS `?t`, \
@ -1116,6 +1187,28 @@ fn test_project_aggregates() {
assert_eq!(args, vec![]); assert_eq!(args, vec![]);
} }
#[test]
fn test_project_the() {
let schema = prepopulated_typed_schema(ValueType::Long);
let query = r#"[:find (the ?e) (max ?t)
:where
[?e :foo/bar ?t]]"#;
let SQLQuery { sql, args } = translate(&schema, query);
// We shouldn't NULL-check (the).
assert_eq!(sql, "SELECT * \
FROM \
(SELECT `?e` AS `?e`, max(`?t`) AS `(max ?t)` \
FROM \
(SELECT DISTINCT \
`datoms00`.e AS `?e`, \
`datoms00`.v AS `?t` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99)) \
WHERE `(max ?t)` IS NOT NULL");
assert_eq!(args, vec![]);
}
#[test] #[test]
fn test_tx_before_and_after() { fn test_tx_before_and_after() {
let schema = prepopulated_typed_schema(ValueType::Long); let schema = prepopulated_typed_schema(ValueType::Long);

View file

@ -1193,6 +1193,94 @@ fn test_aggregate_the() {
TypedValue::typed_string("(1) Facebook")]].into()); TypedValue::typed_string("(1) Facebook")]].into());
} }
#[test]
fn test_null_aggregates() {
let store = Store::open("").expect("opened");
let rel =
store.q_once(r#"
[:find (count ?tx) (max ?txInstant)
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_rel_result()
.expect("no results");
// (count ?tx) is 0, but (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows.
// We reject the entire row containing NULL aggregates.
assert_eq!(0, rel.row_count());
let rel_pull =
store.q_once(r#"
[:find (count ?tx) (max ?txInstant) (pull ?tx [*])
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_rel_result()
.expect("no results");
// Same logic as above -- just verifying that `RelTwoStagePullProjector` handles NULL.
assert_eq!(0, rel_pull.row_count());
let coll =
store.q_once(r#"
[:find [(max ?txInstant) ...]
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_coll_result()
.expect("no results");
// (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows. We reject the entire
// row containing NULL aggregates, yielding an empty vector of results.
assert_eq!(coll, vec![]);
let tuple =
store.q_once(r#"
[:find [(count ?tx) (max ?txInstant)]
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_tuple_result()
.expect("no results");
// (count ?tx) is 0, but (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows.
// We reject the entire row containing NULL aggregates, yielding no tuple result at all.
assert_eq!(tuple, None);
let tuple_pull =
store.q_once(r#"
[:find [(count ?tx) (max ?txInstant) (pull ?tx [*])]
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_tuple_result()
.expect("no results");
// Same logic as above -- just verifying that `CollTwoStagePullProjector` handles NULL.
assert_eq!(tuple_pull, None);
let scalar =
store.q_once(r#"
[:find (max ?txInstant) .
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_scalar_result()
.expect("no results");
// (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows. We reject the entire
// row containing NULL aggregates, yielding no scalar result at all.
assert_eq!(scalar, None);
}
#[test] #[test]
fn test_aggregation_implicit_grouping() { fn test_aggregation_implicit_grouping() {
let mut store = Store::open("").expect("opened"); let mut store = Store::open("").expect("opened");