Skip to content
Snippets Groups Projects
Commit 3418afa4 authored by Christian Dietrich's avatar Christian Dietrich
Browse files

Dec 19 -- Cookie Counting

Article:  https://ibr.cs.tu-bs.de/advent/19-perf/
Workload: ~68 source-code lines
parent 1a7ccb7f
No related branches found
No related tags found
No related merge requests found
PROG = perf
${PROG}: ${PROG}.c matrix.c
gcc $< -o $@ -Wall -g -O3
run: ${PROG}
./${PROG} 512
strace: ${PROG}
strace ./${PROG} 512
clean:
rm -f ./${PROG}
double *create_matrix(unsigned dim) {
double *ret = calloc(dim * dim, sizeof(double));
if (!ret) die("calloc");
return ret;
}
double *create_random_matrix(unsigned dim) {
double * ret = create_matrix(dim);
for (unsigned i = 0; i < dim * dim; i ++) {
ret[i] = random() / (double) INT_MAX;
}
return ret;
}
void matrixmul_drepper(unsigned N, double *A, double *B, double *C) {
unsigned CLS = 64;
unsigned SM = CLS / sizeof(double);
int i, i2, j, j2, k, k2;
double *restrict rres;
double *restrict rmul1;
double *restrict rmul2;
for (i = 0; i < N; i += SM)
for (j = 0; j < N; j += SM)
for (k = 0; k < N; k += SM)
for (i2 = 0, rres = &C[i * N + j], rmul1 = &A[i * N + k]; i2 < SM;
++i2, rres += N, rmul1 += N)
{
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
for (k2 = 0, rmul2 = &B[k * N + j]; k2 < SM; ++k2, rmul2 += N)
{
__m128d m1d = _mm_load_sd (&rmul1[k2]);
m1d = _mm_unpacklo_pd (m1d, m1d);
for (j2 = 0; j2 < SM; j2 += 2)
{
__m128d m2 = _mm_load_pd (&rmul2[j2]);
__m128d r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
}
}
}
}
void matrixmul_naive(unsigned N, double *A, double *B, double *C) {
unsigned i, j, k;
for (i = 0; i < N; ++i)
for (j = 0; j < N; ++j) {
double sum = 0;
for (k = 0; k < N; ++k)
sum += A[i *N + k] * B[k * N + j];
C[i * N + j] = sum;
}
}
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <asm/unistd.h>
#include <errno.h>
#include <stdint.h>
#include <inttypes.h>
#include <emmintrin.h>
#include <limits.h>
#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(*(arr)))
#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while(0)
// Matrix Multiplication Functions
#include "matrix.c"
// When reading from the perf descriptor, the kernel returns an
// event record in the following format (if PERF_FORMAT_GROUP |
// PERF_FORMAT_ID are enabled).
// Example (with id0=100, id1=200): {.nr = 2, .values = {{41433, 200}, {42342314, 100}}}
typedef uint64_t perf_event_id; // For readability only
struct read_format {
uint64_t nr;
struct {
uint64_t value;
perf_event_id id; // PERF_FORMAT_ID
} values[];
};
// Structure to hold a perf group
struct perf_handle {
int group_fd; // First perf_event fd that we create
int nevents; // Number of registered events
size_t rf_size; // How large is the read_format buffer (derived from nevents)
struct read_format *rf; // heap-allocated buffer for the read event
};
// Syscall wrapper for perf_event_open(2), as glibc does not have one
int sys_perf_event_open(struct perf_event_attr *attr,
pid_t pid, int cpu, int group_fd,
unsigned long flags) {
return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
// Add a perf event of given (type, config) with default config. If p
// is not yet initialized (p->group_fd <=0), the perf_event becomes
// the group leader. The function returns an id that can be used in
// combination with perf_event_get.
perf_event_id perf_event_add(struct perf_handle *p, int type, int config) {
// FIXME: Create event with perf_event_open
// FIXME: Get perf_event_id with PERF_EVENT_IOC_ID
return -1;
}
// Resets and starts the perf measurement
void perf_event_start(struct perf_handle *p) {
// FIXME: PERF_EVENT_IOC_{RESET, ENABLE}
}
// Stops the perf measurement and reads out the event
void perf_event_stop(struct perf_handle *p) {
// FIXME: PERF_EVENT_IOC_DISABLE
// FIXME: Read event from the group_fd into an allocated buffer
}
// After the measurement, this helper extracts the event counter for
// the given perf_event_id (which was returned by perf_event_add)
uint64_t perf_event_get(struct perf_handle *p, perf_event_id id) {
return -1;
}
int main(int argc, char* argv[]) {
unsigned dim = 32;
if (argc > 1) {
dim = atoi(argv[1]);
}
if ((dim & (dim - 1)) != 0) {
fprintf(stderr, "Given dimension must be a power of two\n");
exit(EXIT_FAILURE);
}
// Create some matrices
double *A = create_random_matrix(dim);
double *B = create_random_matrix(dim);
double *C0 = create_matrix(dim);
double *C1 = create_matrix(dim);
size_t msize = sizeof(double) * dim * dim;
printf("matrix_size: %.2f MiB\n", msize / (1024.0 * 1024.0));
// We provide you with two matrix multiply implementations (see
// matrix.c). The optimized variant was created by Ulrich Drepper
// and is optimized for cache usage. For a detailed discussion,
// we refer you to
//
// "What Every Programmer Should Know About Memory", Ulrich Drepper, 2007,
// https://www.akkadia.org/drepper/cpumemory.pdf
matrixmul_drepper(dim, A, B, C0);
matrixmul_naive(dim, A, B, C1);
// Sanity Checking: are both result matrices equal (with a margin of 0.1%) ?
for (unsigned i = 0; i < (dim*dim); i++) {
double delta = 1.0 - C1[i]/C0[i];
if (delta > 0.001 || delta < -0.001) {
fprintf(stderr, "mismatch at %d: %f%%\n", i, delta * 100);
return -1;
}
}
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment