[query] Handle SQL NULL for aggregates over 0 rows. (#684) (#688) r=rnewman

This uses a `SELECT *` from an inner subselect to filter potentially `NULL` aggregates.

The alternative is to handle `NULL` values throughout the projector, which is simple but loses a valuable invariant: Mentat SQL queries produce values that are not `NULL`.
This commit is contained in:
Nick Alexander 2018-06-01 14:17:31 -07:00 committed by GitHub
parent 2a025916fe
commit d8d18a1731
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 283 additions and 14 deletions

View file

@ -330,7 +330,7 @@ impl Debug for QueryValue {
/// Represents an entry in the ORDER BY list: a variable or a variable's type tag.
/// (We require order vars to be projected, so we can simply use a variable here.)
#[derive(Debug)]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OrderBy(pub Direction, pub VariableColumn);
impl From<Order> for OrderBy {

View file

@ -164,6 +164,15 @@ impl SimpleAggregate {
Count | Sum => false,
}
}
/// Return `true` if this aggregate can be `NULL` over 0 rows.
pub(crate) fn is_nullable(&self) -> bool {
use self::SimpleAggregationOp::*;
match self.op {
Avg | Max | Min => true,
Count | Sum => false,
}
}
}
pub(crate) trait SimpleAggregation {
@ -201,7 +210,11 @@ pub(crate) fn projected_column_for_simple_aggregate(simple: &SimpleAggregate, cc
sql_op: simple.op.to_sql(),
arg: ColumnOrExpression::Value(value),
};
ColumnOrExpression::Expression(Box::new(expression), return_type)
if simple.is_nullable() {
ColumnOrExpression::NullableAggregate(Box::new(expression), return_type)
} else {
ColumnOrExpression::Expression(Box::new(expression), return_type)
}
}
} else {
// The common case: the values are bound during execution.
@ -210,7 +223,11 @@ pub(crate) fn projected_column_for_simple_aggregate(simple: &SimpleAggregate, cc
sql_op: simple.op.to_sql(),
arg: ColumnOrExpression::ExistingColumn(name),
};
ColumnOrExpression::Expression(Box::new(expression), return_type)
if simple.is_nullable() {
ColumnOrExpression::NullableAggregate(Box::new(expression), return_type)
} else {
ColumnOrExpression::Expression(Box::new(expression), return_type)
}
};
Ok((ProjectedColumn(projected_column_or_expression, simple.column_name()), return_type))
}

View file

@ -62,7 +62,10 @@ pub enum ColumnOrExpression {
Integer(i32), // We use these for type codes etc.
Long(i64),
Value(TypedValue),
Expression(Box<Expression>, ValueType), // Track the return type.
// Some aggregates (`min`, `max`, `avg`) can be over 0 rows, and therefore can be `NULL`; that
// needs special treatment.
NullableAggregate(Box<Expression>, ValueType), // Track the return type.
Expression(Box<Expression>, ValueType), // Track the return type.
}
pub enum Expression {
@ -130,6 +133,12 @@ pub enum Constraint {
left: ColumnOrExpression,
list: Vec<ColumnOrExpression>,
},
IsNull {
value: ColumnOrExpression,
},
IsNotNull {
value: ColumnOrExpression,
},
NotExists {
subquery: TableOrSubquery,
},
@ -278,6 +287,7 @@ impl QueryFragment for ColumnOrExpression {
&Value(ref v) => {
out.push_typed_value(v)
},
&NullableAggregate(ref e, _) |
&Expression(ref e, _) => {
e.push_sql(out)
},
@ -343,6 +353,18 @@ impl QueryFragment for Constraint {
right.push_sql(out)
},
&IsNull { ref value } => {
value.push_sql(out)?;
out.push_sql(" IS NULL");
Ok(())
},
&IsNotNull { ref value } => {
value.push_sql(out)?;
out.push_sql(" IS NOT NULL");
Ok(())
},
&And { ref constraints } => {
// An empty intersection is true.
if constraints.is_empty() {

View file

@ -427,14 +427,63 @@ fn re_project(mut inner: SelectQuery, projection: Projection) -> SelectQuery {
let limit = inner.limit;
inner.limit = Limit::None;
SelectQuery {
use self::Projection::*;
let nullable = match &projection {
&Columns(ref columns) => {
columns.iter().filter_map(|pc| {
match pc {
&ProjectedColumn(ColumnOrExpression::NullableAggregate(_, _), ref name) => {
Some(Constraint::IsNotNull {
value: ColumnOrExpression::ExistingColumn(name.clone()),
})
},
_ => None,
}
}).collect()
},
&Star => vec![],
&One => vec![],
};
if nullable.is_empty() {
return SelectQuery {
distinct: outer_distinct,
projection: projection,
from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(inner))])),
constraints: vec![],
group_by: group_by,
order: order_by,
limit: limit,
};
}
// Our pattern is `SELECT * FROM (SELECT ...) WHERE (nullable aggregate) IS NOT NULL`. If
// there's an `ORDER BY` in the subselect, SQL does not guarantee that the outer select will
// respect that order. But `ORDER BY` is relevant to the subselect when we have a `LIMIT`.
// Thus we lift the `ORDER BY` if theres no `LIMIT` in the subselect, and repeat the `ORDER BY`
// if there is.
let subselect = SelectQuery {
distinct: outer_distinct,
projection: projection,
from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(inner))])),
constraints: vec![],
group_by: group_by,
order: match &limit {
&Limit::None => vec![],
&Limit::Fixed(_) | &Limit::Variable(_) => order_by.clone(),
},
limit,
};
SelectQuery {
distinct: false,
projection: Projection::Star,
from: FromClause::TableList(TableList(vec![TableOrSubquery::Subquery(Box::new(subselect))])),
constraints: nullable,
group_by: vec![],
order: order_by,
limit: limit,
limit: Limit::None, // Any limiting comes from the internal query.
}
}

View file

@ -1091,14 +1091,17 @@ fn test_project_aggregates() {
let SQLQuery { sql, args } = translate(&schema, query);
// No outer DISTINCT: we aggregate or group by every variable.
assert_eq!(sql, "SELECT `?e` AS `?e`, max(`?t`) AS `(max ?t)` \
assert_eq!(sql, "SELECT * \
FROM \
(SELECT DISTINCT \
`datoms00`.e AS `?e`, \
`datoms00`.v AS `?t` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99) \
GROUP BY `?e`");
(SELECT `?e` AS `?e`, max(`?t`) AS `(max ?t)` \
FROM \
(SELECT DISTINCT \
`datoms00`.e AS `?e`, \
`datoms00`.v AS `?t` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99) \
GROUP BY `?e`) \
WHERE `(max ?t)` IS NOT NULL");
assert_eq!(args, vec![]);
let query = r#"[:find (max ?t)
@ -1106,7 +1109,75 @@ fn test_project_aggregates() {
:where
[?e :foo/bar ?t]]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT max(`?t`) AS `(max ?t)` \
assert_eq!(sql, "SELECT * \
FROM \
(SELECT max(`?t`) AS `(max ?t)` \
FROM \
(SELECT DISTINCT \
`datoms00`.v AS `?t`, \
`datoms00`.e AS `?e` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99)\
) \
WHERE `(max ?t)` IS NOT NULL");
assert_eq!(args, vec![]);
// ORDER BY lifted to outer query if there is no LIMIT.
let query = r#"[:find (max ?x)
:with ?e
:where
[?e ?a ?t]
[?t :foo/bar ?x]
:order ?a]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT * \
FROM \
(SELECT max(`?x`) AS `(max ?x)`, `?a` AS `?a` \
FROM \
(SELECT DISTINCT \
`datoms01`.v AS `?x`, \
`datoms00`.a AS `?a`, \
`datoms00`.e AS `?e` \
FROM `datoms` AS `datoms00`, `datoms` AS `datoms01` \
WHERE `datoms01`.a = 99 AND `datoms00`.v = `datoms01`.e) \
GROUP BY `?a`) \
WHERE `(max ?x)` IS NOT NULL \
ORDER BY `?a` ASC");
assert_eq!(args, vec![]);
// ORDER BY duplicated in outer query if there is a LIMIT.
let query = r#"[:find (max ?x)
:with ?e
:where
[?e ?a ?t]
[?t :foo/bar ?x]
:order (desc ?a)
:limit 10]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT * \
FROM \
(SELECT max(`?x`) AS `(max ?x)`, `?a` AS `?a` \
FROM \
(SELECT DISTINCT \
`datoms01`.v AS `?x`, \
`datoms00`.a AS `?a`, \
`datoms00`.e AS `?e` \
FROM `datoms` AS `datoms00`, `datoms` AS `datoms01` \
WHERE `datoms01`.a = 99 AND `datoms00`.v = `datoms01`.e) \
GROUP BY `?a` \
ORDER BY `?a` DESC \
LIMIT 10) \
WHERE `(max ?x)` IS NOT NULL \
ORDER BY `?a` DESC");
assert_eq!(args, vec![]);
// No outer SELECT * for non-nullable aggregates.
let query = r#"[:find (count ?t)
:with ?e
:where
[?e :foo/bar ?t]]"#;
let SQLQuery { sql, args } = translate(&schema, query);
assert_eq!(sql, "SELECT count(`?t`) AS `(count ?t)` \
FROM \
(SELECT DISTINCT \
`datoms00`.v AS `?t`, \
@ -1116,6 +1187,28 @@ fn test_project_aggregates() {
assert_eq!(args, vec![]);
}
#[test]
fn test_project_the() {
let schema = prepopulated_typed_schema(ValueType::Long);
let query = r#"[:find (the ?e) (max ?t)
:where
[?e :foo/bar ?t]]"#;
let SQLQuery { sql, args } = translate(&schema, query);
// We shouldn't NULL-check (the).
assert_eq!(sql, "SELECT * \
FROM \
(SELECT `?e` AS `?e`, max(`?t`) AS `(max ?t)` \
FROM \
(SELECT DISTINCT \
`datoms00`.e AS `?e`, \
`datoms00`.v AS `?t` \
FROM `datoms` AS `datoms00` \
WHERE `datoms00`.a = 99)) \
WHERE `(max ?t)` IS NOT NULL");
assert_eq!(args, vec![]);
}
#[test]
fn test_tx_before_and_after() {
let schema = prepopulated_typed_schema(ValueType::Long);

View file

@ -1193,6 +1193,94 @@ fn test_aggregate_the() {
TypedValue::typed_string("(1) Facebook")]].into());
}
#[test]
fn test_null_aggregates() {
let store = Store::open("").expect("opened");
let rel =
store.q_once(r#"
[:find (count ?tx) (max ?txInstant)
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_rel_result()
.expect("no results");
// (count ?tx) is 0, but (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows.
// We reject the entire row containing NULL aggregates.
assert_eq!(0, rel.row_count());
let rel_pull =
store.q_once(r#"
[:find (count ?tx) (max ?txInstant) (pull ?tx [*])
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_rel_result()
.expect("no results");
// Same logic as above -- just verifying that `RelTwoStagePullProjector` handles NULL.
assert_eq!(0, rel_pull.row_count());
let coll =
store.q_once(r#"
[:find [(max ?txInstant) ...]
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_coll_result()
.expect("no results");
// (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows. We reject the entire
// row containing NULL aggregates, yielding an empty vector of results.
assert_eq!(coll, vec![]);
let tuple =
store.q_once(r#"
[:find [(count ?tx) (max ?txInstant)]
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_tuple_result()
.expect("no results");
// (count ?tx) is 0, but (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows.
// We reject the entire row containing NULL aggregates, yielding no tuple result at all.
assert_eq!(tuple, None);
let tuple_pull =
store.q_once(r#"
[:find [(count ?tx) (max ?txInstant) (pull ?tx [*])]
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_tuple_result()
.expect("no results");
// Same logic as above -- just verifying that `CollTwoStagePullProjector` handles NULL.
assert_eq!(tuple_pull, None);
let scalar =
store.q_once(r#"
[:find (max ?txInstant) .
:where [_ _ _ ?tx]
[?tx :db/txInstant ?txInstant]
[(< ?txInstant #inst "2016-01-01T11:00:00.000Z")]
]"#, None)
.into_scalar_result()
.expect("no results");
// (max ?txInstant) is over 0 SQL rows, yielding a NULL in the SQL rows. We reject the entire
// row containing NULL aggregates, yielding no scalar result at all.
assert_eq!(scalar, None);
}
#[test]
fn test_aggregation_implicit_grouping() {
let mut store = Store::open("").expect("opened");