d5/da5/_hash_partition_sink_8h_source.html

 /*****************************************************************************

  *                                                                           *

  *  Copyright 2018 Rice University                                           *

  *                                                                           *

  *  Licensed under the Apache License, Version 2.0 (the "License");          *

  *  you may not use this file except in compliance with the License.         *

  *  You may obtain a copy of the License at                                  *

  *                                                                           *

  *      http://www.apache.org/licenses/LICENSE-2.0                           *

  *                                                                           *

  *  Unless required by applicable law or agreed to in writing, software      *

  *  distributed under the License is distributed on an "AS IS" BASIS,        *

  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *

  *  See the License for the specific language governing permissions and      *

  *  limitations under the License.                                           *

  *                                                                           *

  *****************************************************************************/


 #ifndef HASH_PARTITION_SINK_H

 #define HASH_PARTITION_SINK_H


 #include "ComputeSink.h"

 #include "TupleSetMachine.h"

 #include "TupleSet.h"

 #include "DataTypes.h"

 #include <vector>


 namespace pdb {


 // runs hashes all of the tuples, and stores all tuples to a container that is partitioned

 // by node partitions.

 template <class KeyType, class ValueType>

 class HashPartitionSink : public ComputeSink {


 public:


     HashPartitionSink(int numPartitions, int numNodes, TupleSpec& inputSchema, TupleSpec& attsToOperateOn) {


         // to setup the output tuple set

         TupleSpec empty;

         TupleSetSetupMachine myMachine(inputSchema, empty);


         // this is the input attribute that we will process

         std::vector<int> matches = myMachine.match(attsToOperateOn);

         whichAttToStore = matches[0];

         whichAttToHash = matches[1];

         std::cout << "whichAttToStore=" << whichAttToStore << std::endl;

         std::cout << "whichAttToHash=" << whichAttToHash << std::endl;

         this->numPartitions = numPartitions;

         this->numNodes = numNodes;

         std::cout << "numPartitions=" << numPartitions << std::endl;

         std::cout << "numNodes=" << numNodes << std::endl;

     }


     Handle<Object> createNewOutputContainer() override {


         // we create a node-partitioned vector to store the output

         Handle<Vector<Handle<Vector<Handle<ValueType>>>>> returnVal =

             makeObject<Vector<Handle<Vector<Handle<ValueType>>>>>(numNodes);

         for (int i = 0; i < numNodes; i++) {

             Handle<Vector<Handle<ValueType>>> curNodeVec

                 = makeObject<Vector<Handle<ValueType>>>();

             returnVal->push_back(curNodeVec);

         }

         return returnVal;

     }


     void writeOut(TupleSetPtr input, Handle<Object>& writeToMe) override {


         // get the partitioned vector we are adding to

         Handle<Vector<Handle<Vector<Handle<ValueType>>>>> writeMe =

             unsafeCast<Vector<Handle<Vector<Handle<ValueType>>>>>(writeToMe);

         size_t hashVal;


         // get the key columns

         std::vector<KeyType>& keyColumn = input->getColumn<KeyType>(whichAttToHash);


         // get the value columns

         std::vector<Handle<ValueType>>& valueColumn = input->getColumn<Handle<ValueType>>(whichAttToStore);


         // and allocate everyone to a partition

         size_t length = keyColumn.size();

         for (size_t i = 0; i < length; i++) {


             hashVal = Hasher<KeyType>::hash(keyColumn[i]);

             int nodeId = (hashVal % (numPartitions))/(numPartitions/numNodes);

             Vector<Handle<ValueType>>& myVec = *((*writeMe)[nodeId]);


             try {

                 //to add the value to the partition

                 myVec.push_back(valueColumn[i]);


             } catch (NotEnoughSpace & n) {


                 /* if we got here then we run out of space and we need delete the already-processed

                  *  data, throw an exception so that new space can be allocated by handling the exception,

                  *  and try to process the remaining unprocessed data again */

                 keyColumn.erase(keyColumn.begin(), keyColumn.begin() + i);

                 valueColumn.erase(valueColumn.begin(), valueColumn.begin() + i);

                 throw n;


             }

         }

     }


     ~HashPartitionSink() {}


 private:

     // the attribute to operate on

     int whichAttToHash;


     // the attribute to store

     int whichAttToStore;


     // number of partitions in the cluster

     int numPartitions;


     // number of nodes in the cluster

     int numNodes;


 };

 }


 #endif

pdb::ComputeSink
Definition: ComputeSink.h:33

pdb::HashPartitionSink::createNewOutputContainer
Handle< Object > createNewOutputContainer() override
Definition: HashPartitionSink.h:67

ComputeSink.h

pdb::HashPartitionSink
Definition: HashPartitionSink.h:34

pdb::HashPartitionSink::whichAttToStore
int whichAttToStore
Definition: HashPartitionSink.h:134

TupleSpec
Definition: TupleSpec.h:34

pdb::HashPartitionSink::writeOut
void writeOut(TupleSetPtr input, Handle< Object > &writeToMe) override
Definition: HashPartitionSink.h:87

pdb::TupleSetSetupMachine::match
std::vector< int > match(TupleSpec &attsToMatch)
Definition: TupleSetMachine.h:43

pdb::NotEnoughSpace
Definition: Allocator.h:389

pdb::HashPartitionSink::~HashPartitionSink
~HashPartitionSink()
Definition: HashPartitionSink.h:126

DataTypes.h

pdb::Vector
Definition: PDBVector.h:41

pdb::HashPartitionSink::HashPartitionSink
HashPartitionSink(int numPartitions, int numNodes, TupleSpec &inputSchema, TupleSpec &attsToOperateOn)
Definition: HashPartitionSink.h:45

pdb::TupleSetPtr
std::shared_ptr< TupleSet > TupleSetPtr
Definition: TupleSet.h:64

pdb::HashPartitionSink::whichAttToHash
int whichAttToHash
Definition: HashPartitionSink.h:131

pdb::HashPartitionSink::numNodes
int numNodes
Definition: HashPartitionSink.h:140

TupleSetMachine.h

pdb::Hasher::hash
static auto hash(const KeyType &k) -> decltype(hash_impl(k, 0))
Definition: PairArray.cc:85

pdb::Vector::push_back
void push_back(const TypeContained &val)
Definition: PDBVector.cc:95

TupleSet.h

pdb::HashPartitionSink::numPartitions
int numPartitions
Definition: HashPartitionSink.h:137

pdb::TupleSetSetupMachine
Definition: TupleSetMachine.h:24

pdb::Handle
Definition: Allocator.h:399