Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Optimizer refactor and cost model additions #1484

Merged
merged 23 commits into from
Feb 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/expression/abstract_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ bool AbstractExpression::operator==(const AbstractExpression &rhs) const {
if (exp_type_ != rhs.exp_type_ || children_.size() != rhs.children_.size())
return false;

// TODO: Try sorting the children
// TODO: Extend this to other comparison predicates
if (exp_type_ == ExpressionType::COMPARE_EQUAL && children_.size() == 2 && rhs.children_.size() == 2) {
return (*children_[0] == *rhs.children_[0] && *children_[1] == *rhs.children_[1]) ||
(*children_[0] == *rhs.children_[1] && *children_[1] == *rhs.children_[0]);
}

for (unsigned i = 0; i < children_.size(); i++) {
if (*children_[i].get() != *rhs.children_[i].get()) return false;
}
Expand Down
63 changes: 0 additions & 63 deletions src/include/optimizer/cost_calculator.h

This file was deleted.

43 changes: 43 additions & 0 deletions src/include/optimizer/cost_model/abstract_cost_model.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// abstract_cost_model.h
//
// Identification: src/include/optimizer/cost_model/abstract_cost_model.h
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include "optimizer/operator_visitor.h"

namespace peloton {
namespace optimizer {

class Memo;

// Fallback cost used when the cost model cannot compute a meaningful cost.
constexpr double DEFAULT_COST{1.0};

// Per-row processing cost charged for each tuple handled during a query.
constexpr double DEFAULT_TUPLE_COST{0.01};

// Per-entry processing cost charged for each index entry touched during an
// index scan.
constexpr double DEFAULT_INDEX_TUPLE_COST{0.005};

// Cost charged for each operator or function executed during a query.
constexpr double DEFAULT_OPERATOR_COST{0.0025};

// Interface for optimizer cost models.
//
// A cost model is an OperatorVisitor that additionally exposes
// CalculateCost(); concrete models supply the per-operator Visit() logic and
// the CalculateCost() entry point.
class AbstractCostModel : public OperatorVisitor {
 public:
  // Virtual so a concrete cost model can be destroyed safely through an
  // AbstractCostModel pointer, independent of how the base visitor declares
  // its own destructor.
  virtual ~AbstractCostModel() = default;

  // Computes the cost of a group expression.
  //
  // gexpr: the group expression to cost.
  // memo:  the memo handed to the implementation alongside gexpr.
  // txn:   the transaction context handed to the implementation.
  //
  // Returns the computed cost as a double.
  virtual double CalculateCost(GroupExpression *gexpr, Memo *memo,
                               concurrency::TransactionContext *txn) = 0;
};

} // namespace optimizer
} // namespace peloton

161 changes: 161 additions & 0 deletions src/include/optimizer/cost_model/default_cost_model.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// default_cost_model.h
//
// Identification: src/include/optimizer/cost_model/default_cost_model.h
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include <algorithm>
#include <cmath>
#include <memory>

#include "optimizer/cost_model/abstract_cost_model.h"
#include "expression/tuple_value_expression.h"
#include "catalog/table_catalog.h"
#include "optimizer/memo.h"
#include "optimizer/operators.h"
#include "optimizer/stats/stats_storage.h"
#include "optimizer/stats/table_stats.h"

namespace peloton {
namespace optimizer {

class Memo;
// Derive cost for a physical group expression
class DefaultCostModel : public AbstractCostModel {
public:
DefaultCostModel(){};

double CalculateCost(GroupExpression *gexpr, Memo *memo,
concurrency::TransactionContext *txn) {
gexpr_ = gexpr;
memo_ = memo;
txn_ = txn;
gexpr_->Op().Accept(this);
return output_cost_;
}

void Visit(UNUSED_ATTRIBUTE const DummyScan *op) {
output_cost_ = 0.f;
}
void Visit(const PhysicalSeqScan *op) {
auto table_stats = std::dynamic_pointer_cast<TableStats>(
StatsStorage::GetInstance()->GetTableStats(
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
if (table_stats->GetColumnCount() == 0) {
output_cost_ = 1.f;
return;
}
output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) {
auto table_stats = std::dynamic_pointer_cast<TableStats>(
StatsStorage::GetInstance()->GetTableStats(
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) {
output_cost_ = 0.f;
return;
}
// Index search cost + scan cost
output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST +
memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() *
DEFAULT_TUPLE_COST;
}

void Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) {
output_cost_ = 0.f;
}

void Visit(const PhysicalOrderBy *) { SortCost(); }

void Visit(const PhysicalLimit *op) {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();

output_cost_ =
std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) {
auto left_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
auto right_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();

output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) {
auto left_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
auto right_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();
// TODO(boweic): Build (left) table should have different cost to probe table
output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) {
// TODO(boweic): Integrate hash in groupby may cause us to miss the
// opportunity to further optimize some query where the child output is
// already hashed by the GroupBy key, we'll do a hash anyway
output_cost_ = HashCost() + GroupByCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) {
// Sort group by does not sort the tuples, it requires input columns to be
// sorted
output_cost_ = GroupByCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) {
output_cost_ = HashCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) {
// TODO(boweic): Ditto, separate groupby operator and implementation(e.g.
// hash, sort) may enable opportunity for further optimization
output_cost_ = HashCost() + GroupByCost();
}

private:

double HashCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
// O(tuple)
return child_num_rows * DEFAULT_TUPLE_COST;
}

double SortCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();

if (child_num_rows == 0) {
return 1.0f;
}
// O(tuple * log(tuple))
return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST;
}

double GroupByCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
// O(tuple)
return child_num_rows * DEFAULT_TUPLE_COST;
}

GroupExpression *gexpr_;
Memo *memo_;
concurrency::TransactionContext *txn_;
double output_cost_ = 0;
};

} // namespace optimizer
} // namespace peloton
Loading