Streaming SIMD Extensions 2 Instructions
  How to Write Fast Numerical Code
  C++ - Getting started with SSE
  Stackoverflow
Introduction
  - SSE stands for Streaming SIMD Extensions. It is a set of CPU instructions dedicated to applications like signal processing, scientific computation or 3D graphics.
- SIMD is an acronym itself: Single Instruction, Multiple Data. A CPU instruction is said to be SIMD when the same operation is applied on multiple data at the same time.
- Naming convention: _mm_<intrin_op>_<suffix>
// a is 16-byte aligned
float a[4] = {1.0, 2.0, 3.0, 4.0}; 
__m128 t = _mm_load_ps(a); // p: packed, s: single precision
Issues
  - Alignment is important (128 bit = 16 byte)
- You need to code explicit loads and stores
- Overhead through shuffles
Example
  - _mm_store_psstores results in an array.
- _mm_sqrt_pscomputes the square root of 4 float in a single operation.
- posix_memalignallocates data on the stack;
- aligned (alignment)allocates aligned data on the heap. (This attribute specifies a minimum alignment for the variable or structure field, measured in bytes.) e.g.,- float a[] __attribute__ ((aligned (16))) = { 41982.,  81.5091, 3.14, 42.666 };
#include <emmintrin.h> // IMPORTANT
#include <math.h>
#include <stdio.h>
#include <chrono>
#include <iostream>
using namespace std;
class Timer {
 public:
  Timer() { start = Clock::now(); }
  void Start() { start = Clock::now(); }
  void End() {
    auto end = Clock::now();
    std::cout << "Elapsed time: "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(end -
                                                                      start)
                     .count()
              << " nanoseconds" << std::endl;
  }
 private:
  using Clock = std::chrono::high_resolution_clock;
  std::chrono::_V2::system_clock::time_point start;
};
void normal(float* a, int N) {
  for (int i = 0; i < N; ++i) {
    a[i] = sqrt(a[i]);
  }
}
// compute the square root of a very large array on float
void sse(float* a, int N) {
  // We assume N % 4 == 0.
  int nb_iters = N / 4;
  __m128* ptr = (__m128*)a;
  for (int i = 0; i < nb_iters; ++i, ++ptr, a += 4) {
    _mm_store_ps(a, _mm_sqrt_ps(*ptr));
  }
}
// add two arrays of char together
void sse(char* a, const char* b, int N) {
  int nb_iters = N / 16;
  __m128i* l = (__m128i*)a;
  __m128i* r = (__m128i*)b;
  for (int i = 0; i < nb_iters; ++i, ++l, ++r)
    _mm_store_si128(l, _mm_add_epi8(*l, *r));
}
int main(int argc, char** argv) {
  if (argc != 2) {
    cout << "Usage: ./sse_test N (N represents the number of floats)" << endl;
    return 1;
  }
  int N = atoi(argv[1]);
  float* a;
  posix_memalign((void**)&a, 16, N * sizeof(float));
  for (int i = 0; i < N; ++i) {
    a[i] = 3141592.65358;
  }
  Timer timer;
  {
    timer.Start();
    normal(a, N);
    timer.End();
  }
  for (int i = 0; i < N; ++i) {
    a[i] = 3141592.65358;
  }
  {
    timer.Start();
    sse(a, N);
    timer.End();
  }
}
g++ -o sse_test sse_test.cc -std=c++11 -O3 -msse2
./sse_test 64000000
Elapsed time: 336269639 nanoseconds
Elapsed time: 29902925 nanoseconds