A platform for high-performance distributed tool and library development written in C++. It can be deployed in two different cluster modes: standalone or distributed. API for v0.5.0, released on June 13, 2018.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
Sampler.h
Go to the documentation of this file.
1 /*****************************************************************************
2  * *
3  * Copyright 2018 Rice University *
4  * *
5  * Licensed under the Apache License, Version 2.0 (the "License"); *
6  * you may not use this file except in compliance with the License. *
7  * You may obtain a copy of the License at *
8  * *
9  * http://www.apache.org/licenses/LICENSE-2.0 *
10  * *
11  * Unless required by applicable law or agreed to in writing, software *
12  * distributed under the License is distributed on an "AS IS" BASIS, *
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
14  * See the License for the specific language governing permissions and *
15  * limitations under the License. *
16  * *
17  *****************************************************************************/
18 #ifndef PDB_SAMPLER_H
19 #define PDB_SAMPLER_H
20 
21 #include "KMeansDoubleVector.h"
22 #include <math.h>
23 #include <stdio.h>
24 
25 /* This class wraps utilities for sampling data */
26 namespace pdb {
27 
28 class Sampler {
29 
30 public:
// Selects the multiplier that computeFractionForSampleSize applies to
// sqrt(sampleSizeLowerBound) when padding a with-replacement sample size
// (presumably a number of standard deviations, judging by the name).
// Smaller bounds receive a larger multiplier so the padding stays tight:
//   sampleSizeLowerBound <  6  -> 12.0
//   sampleSizeLowerBound < 16  ->  9.0
//   otherwise                  ->  6.0
static double numStd(int sampleSizeLowerBound) {
    if (sampleSizeLowerBound >= 16) {
        return 6.0;
    }
    return (sampleSizeLowerBound >= 6) ? 9.0 : 12.0;
}
41 
42  static double computeFractionForSampleSize(int sampleSizeLowerBound,
43  long total, bool withReplacement) {
44 
45  if (withReplacement) {
46  return fmax(sampleSizeLowerBound +
47  numStd(sampleSizeLowerBound) * sqrt(sampleSizeLowerBound),
48  1e-15) /
49  total;
50  } else {
51  double fraction = (double)sampleSizeLowerBound / (double)(total);
52  double delata = 1e-4;
53  double gamma = -log(delata) / total;
54  return fmin(1,
55  fmax(1e-10, fraction + gamma +
56  sqrt(gamma * gamma + 2 * gamma * fraction)));
57  }
58  }
59 
60  // srand must be initialized before invoking below function
61  static void
63 
64  size_t mySize = samples.size();
65  for (int i = mySize - 1; i >= 0; i--) {
66  int j = rand() % (i + 1);
67  Handle<KMeansDoubleVector> tmp = samples[j];
68  samples[j] = samples[i];
69  samples[i] = tmp;
70  }
71  }
72 
73  // srand must be initialized before invoking below function
75 
76  size_t mySize = samples.size();
77  for (int i = mySize - 1; i >= 0; i--) {
78  int j = rand() % (i + 1);
79  Handle<KMeansDoubleVector> tmp = samples[j];
80  samples[j] = samples[i];
81  samples[i] = tmp;
82  }
83  }
84 };
85 }
86 
87 #endif
static double numStd(int sampleSizeLowerBound)
Definition: Sampler.h:31
static void randomizeInPlace(Vector< Handle< KMeansDoubleVector >> &samples)
Definition: Sampler.h:74
static double computeFractionForSampleSize(int sampleSizeLowerBound, long total, bool withReplacement)
Definition: Sampler.h:42
static void randomizeInPlace(std::vector< Handle< KMeansDoubleVector >> &samples)
Definition: Sampler.h:62