Map vs Vector

2020-07-15

How much faster is a map over a vector of pairs for lookups, and for what input sizes? What if the vector is sorted? Here is a quick benchmark to find out and the results on my machine.

Implementation

The implementation builds an unsorted vector from a random permutation, a sorted vector, and a map, all of a given input size. It then performs a number of lookups on each, using a linear scan for the unsorted vector and binary search for the sorted one, and measures the overall time it takes to perform all these lookups. A more interesting benchmark could look at the distribution of lookup times for each data structure and print some useful statistics.

#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <map>
#include <random>
#include <string>
#include <vector>

// Create a test map of size N.
std::map<int, int> make_map(int N) {
  std::map<int, int> map;
  for (int i = 0; i < N; ++i) {
    map[i] = i;
  }
  return map;
}

// Create a sorted test vector of size N.
std::vector<std::pair<int, int>> make_sorted_vector(int N) {
  std::vector<std::pair<int, int>> vector(N);
  for (int i = 0; i < N; ++i) {
    vector[i] = std::make_pair(i, i);
  }
  return vector;
}

// Create an unsorted test vector of size N.
std::vector<std::pair<int, int>> make_unsorted_vector(int N) {
  std::vector<std::pair<int, int>> vector = make_sorted_vector(N);
  std::random_shuffle(vector.begin(), vector.end());
  return vector;
}

// Run the given lookup function many times.
// Return the sum of the lookup function results.
int run_lookup(int seed, int min, int max, int times,
               const std::function<int(int)> &lookup) {
  std::uniform_int_distribution<int> uniform(min, max);
  std::minstd_rand rnd(seed);
  int found = 0;
  for (int i = 0; i < times; ++i) {
    int key = uniform(rnd);
    found += lookup(key);
  }
  return found;
}

// Benchmark the given function.
void benchmark(const std::string &name, const std::function<void()> &func) {
  using clock = std::chrono::high_resolution_clock;
  auto start = clock::now();
  func();
  auto end = clock::now();
  auto elapsed_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
  double elapsed_ms = elapsed_ns.count() / 1000.0 / 1000.0;
  std::cout << name << " took " << elapsed_ms << " ms" << std::endl;
}

// Benchmark map and vector lookups.
void run_benchmark(int size) {
  constexpr int NUM_LOOKUPS = 1000 * 1000;
  constexpr int SEED = 123;

  std::cout << std::endl << "Benchmark: Size = " << size << std::endl;

  const std::map<int, int> map = make_map(size);
  const std::vector<std::pair<int, int>> unsorted_vector =
      make_unsorted_vector(size);
  const std::vector<std::pair<int, int>> sorted_vector =
      make_sorted_vector(size);

  const std::function<int(int)> map_lookup = [&](int key) {
    return map.find(key) != map.end() ? 1 : 0;
  };

  const std::function<int(int)> sorted_vector_lookup = [&](int key) {
    return std::binary_search(sorted_vector.begin(), sorted_vector.end(),
                              std::make_pair(key, key))
               ? 1
               : 0;
  };

  const std::function<int(int)> unsorted_vector_lookup = [&](int key) {
    return std::find_if(unsorted_vector.begin(), unsorted_vector.end(),
                        [&](const auto &pair) { return pair.first == key; }) !=
                   unsorted_vector.end()
               ? 1
               : 0;
  };

  int map_found = 0;
  int sorted_vector_found = 0;
  int unsorted_vector_found = 0;

  benchmark("map", [&]() {
    map_found = run_lookup(SEED, 0, size - 1, NUM_LOOKUPS, map_lookup);
  });

  benchmark("sorted vector", [&]() {
    sorted_vector_found =
        run_lookup(SEED, 0, size - 1, NUM_LOOKUPS, sorted_vector_lookup);
  });

  benchmark("unsorted vector", [&]() {
    unsorted_vector_found =
        run_lookup(SEED, 0, size - 1, NUM_LOOKUPS, unsorted_vector_lookup);
  });

  // Not counting or doing anything with the lookup results also allows the
  // compiler to optimize away too much code, so this check serves two purposes.
  if (!(map_found == unsorted_vector_found &&
        map_found == sorted_vector_found)) {
    std::cout << "Oops, something went wrong, not all lookup counts match."
              << std::endl;
    std::cout << "  Map: " << map_found << std::endl;
    std::cout << "  Unsorted vector: " << unsorted_vector_found << std::endl;
    std::cout << "  Sorted vector: " << sorted_vector_found << std::endl;
  }
}

int main() {
  std::cout << "Map vs Vector lookup benchmark" << std::endl;
  std::cout << "------------------------------" << std::endl;

  for (int size = 10; size < 1000 * 1000; size *= 10) {
    run_benchmark(size);
  }

  return 0;
}

Results

A linear scan over an unsorted vector seems to work best for small sizes. The map and a binary search on the sorted vector then take over, with the map yielding to the sorted vector for larger sizes.

Map vs Vector lookup benchmark
------------------------------

Benchmark: Size = 10
map took 44.6243 ms
sorted vector took 47.8598 ms
unsorted vector took 37.6925 ms

Benchmark: Size = 100
map took 59.5871 ms
sorted vector took 70.4616 ms
unsorted vector took 49.0126 ms

Benchmark: Size = 1000
map took 81.0057 ms
sorted vector took 89.1831 ms
unsorted vector took 148.318 ms

Benchmark: Size = 10000
map took 139.211 ms
sorted vector took 107.852 ms
unsorted vector took 1284.97 ms

Benchmark: Size = 100000
map took 252.725 ms
sorted vector took 145.098 ms
unsorted vector took 14211.6 ms