Shuf

/*
Given a data set of an unknown size,
Get a random sample in a random order
It's used in data analytics, often as a way to get a small random sample from a data lake or warehouse, or from a large CSV file
*/
function shuf (datasetSource, sampleSize) {
  const output = fillBaseSample(datasetSource, sampleSize)

  return randomizeOutputFromDataset(datasetSource, output)
}

/**
 * Fills the output if possible, with the minimum number of values
 * @param {Iterable.<T>} datasetSource The iterable source of data
 * @param {number} sampleSize The size of the sample to extract from the dataset
 * @returns {Array.<T>} The random sample, as an array
 * @template T
 */
function fillBaseSample (datasetSource, sampleSize) {
  let filledIndexes = []
  let output = new Array(sampleSize)

  // Spread data out filling the array
  while (true) {
    const iterator = datasetSource.next()
    if (iterator.done) break

    let insertTo = Math.floor(Math.random() * output.length)
    while (filledIndexes.includes(insertTo)) {
      insertTo++
      if (insertTo === output.length) {
        insertTo = 0
      }
    }
    output[insertTo] = {
      value: iterator.value
    }

    filledIndexes = [...filledIndexes, insertTo]

    if (filledIndexes.length === sampleSize) {
      break
    }
  }

  if (filledIndexes.length < output.length) {
    // Not a large enough dataset to fill the sample - trim empty values
    output = output.filter((_, i) => filledIndexes.includes(i))
  }

  return output.map((o) => o.value)
}

/**
 * Replaces values in the output randomly with new ones from the dataset
 * @param {Iterable.<T>} datasetSource The iterable source of data
 * @param {Array.<T>} output The output so far, filled with data
 * @returns {Array.<T>} The random sample, as an array
 * @template T
 */
function randomizeOutputFromDataset (datasetSource, output) {
  const newOutput = [...output]
  let readSoFar = output.length

  while (true) {
    const iterator = datasetSource.next()
    if (iterator.done) break
    readSoFar++

    const insertTo = Math.floor(Math.random() * readSoFar)
    if (insertTo < newOutput.length) {
      newOutput[insertTo] = iterator.value
    }
  }

  return newOutput
}

// Example

/**
 * Generates a random range of data, with values between 0 and 2^31 - 1
 * @param {number} length The number of data items to generate
 * @returns {Iterable<number>} Random iterable data
*/
function * generateRandomData (length) {
  const maxValue = Math.pow(2, 31) - 1
  for (let i = 0; i < length; i++) {
    yield Math.floor(Math.random() * maxValue)
  }
}

// const source = generateRandomData(1000)
// const result = shuf(source, 10)

export { shuf, generateRandomData }
Algerlogo

Β© Alger 2022

About us

We are a group of programmers helping each other build new things, whether it be writing complex encryption programs, or simple ciphers. Our goal is to work together to document and model beautiful, helpful and interesting algorithms using code. We are an open-source community - anyone can contribute. We check each other's work, communicate and collaborate to solve problems. We strive to be welcoming, respectful, yet make sure that our code follows the latest programming guidelines.