diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java
index 477e7fca984c..c99da9b94c7d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java
@@ -101,7 +101,6 @@ public Double visitCall(RexCall call) {
     if (!deep) {
       return 1.0;
     }
-
     /*
      * Ignore any predicates on partition columns because we have already
      * accounted for these in the Table row count.
@@ -160,6 +159,36 @@ public Double visitCall(RexCall call) {
       break;
     }
 
+    case IS_NULL: {
+      if (childRel instanceof HiveTableScan) {
+        HiveTableScan hiveTableScan = (HiveTableScan) childRel;
+        if (hasMissingColumnStats(call, hiveTableScan)) {
+          // Missing or estimated column stats: fall back to the default selectivity.
+          selectivity = DEFAULT_COMPARISON_SELECTIVITY;
+          break;
+        }
+        double noOfNulls = getMaxNulls(call, hiveTableScan);
+        if (childCardinality >= noOfNulls) {
+          // Math.max keeps the division defined when the child cardinality is 0.
+          selectivity = noOfNulls / Math.max(childCardinality, 1);
+        } else {
+          HiveConfPlannerContext ctx = childRel.getCluster().getPlanner().getContext().unwrap(HiveConfPlannerContext.class);
+          String msg = "Invalid statistics: Number of null values > number of tuples. " +
+              "Consider recomputing statistics for table: " +
+              ((RelOptHiveTable) childRel.getTable()).getHiveTableMD().getFullyQualifiedName();
+          // Context.unwrap may return null; in that case only the log receives the warning.
+          if (ctx != null && ctx.isExplainPlan()) {
+            SessionState.getConsole().printError("WARNING: " + msg);
+          }
+          LOG.warn(msg);
+          selectivity = DEFAULT_COMPARISON_SELECTIVITY;
+        }
+      } else {
+        selectivity = computeFunctionSelectivity(call);
+      }
+      break;
+    }
+
     case LESS_THAN_OR_EQUAL:
     case GREATER_THAN_OR_EQUAL:
     case LESS_THAN:
@@ -191,7 +220,6 @@ public Double visitCall(RexCall call) {
       }
       selectivity = computeFunctionSelectivity(call);
     }
-
     return selectivity;
   }
 
@@ -839,6 +867,36 @@ private long getMaxNulls(RexCall call, HiveTableScan t) {
     return maxNoNulls;
   }
 
+  /**
+   * Checks whether usable column statistics are unavailable for the columns referenced by a call.
+   * Statistics are treated as missing when the call references no input columns, when fewer
+   * statistics than referenced columns are returned, or when any returned entry is null or
+   * estimated. Stale statistics are not detected.
+   *
+   * @param call the predicate whose input references are inspected
+   * @param t the table scan that supplies the column statistics
+   * @return true if statistics are missing or estimated for at least one referenced column
+   */
+  private boolean hasMissingColumnStats(RexCall call, HiveTableScan t) {
+    Set<Integer> iRefSet = HiveCalciteUtil.getInputRefs(call);
+    if (iRefSet.isEmpty()) {
+      return true;
+    }
+
+    List<ColStatistics> colStats = t.getColStat(new ArrayList<>(iRefSet));
+    if (colStats.size() < iRefSet.size()) {
+      return true;
+    }
+
+    for (ColStatistics cs : colStats) {
+      // Treat estimated stats as missing stats
+      if (cs == null || cs.isEstimated()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   private Double getMaxNDV(RexCall call) {
     return getMaxNDV(call.getOperands());
   }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java
index 56e294a3fd0a..bd0ee80ee0df 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java
@@ -1116,6 +1116,32 @@ public void testBetweenWithCastToDecimal7s1() {
     checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
   }
 
+  @Test
+  public void testComputeIsNullSelectivityWithStats() {
+    stats.setNumNulls(3);
+    doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.IS_NULL, inputRef0);
+    checkSelectivity(3f / VALUES.length, filter); // 3 / 13
+  }
+
+  @Test
+  public void testComputeIsNullSelectivityMissingStats() {
+    doReturn(Collections.emptyList()).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.IS_NULL, inputRef0);
+    checkSelectivity(1f / 3f, filter); // DEFAULT_COMPARISON_SELECTIVITY
+  }
+
+  @Test
+  public void testComputeIsNullSelectivityEstimatedStatsFallback() {
+    ColStatistics estimated = new ColStatistics();
+    estimated.setIsEstimated(true);
+    estimated.setNumNulls(0);
+    doReturn(Collections.singletonList(estimated)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.IS_NULL, inputRef0);
+    // estimated stats should be treated as missing and fallback to DEFAULT_COMPARISON_SELECTIVITY
+    checkSelectivity(1f / 3f, filter);
+  }
+
   private void checkSelectivity(float expectedSelectivity, RexNode filter) {
     FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq);
     Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(filter), DELTA);