Skip to content
Snippets Groups Projects
Commit 30e96363 authored by Christian Dietrich's avatar Christian Dietrich
Browse files

Dec 23 -- Counting Presents. Fast!

Article:  https://ibr.cs.tu-bs.de/advent/23-rseq/
Workload: ~12 source-code lines
parent 37ca876a
No related branches found
Tags template_23
No related merge requests found
PROG = rseq

# run/clean/man/plot do not produce files of the same name -- declare
# them phony so a stray file named e.g. "clean" or "plot" cannot
# shadow the target.
.PHONY: run clean man plot

${PROG}: ${PROG}.o rseq-asm.o Makefile
	gcc ${PROG}.o rseq-asm.o -o $@ -Wall -g -Wno-unused-function -O3 -lpthread

%.o: %.c
	gcc -c $< -o $@ -Wall -g -Wno-unused-function -O3

%.o: %.S
	gcc -c $< -o $@

run: ${PROG}
	./${PROG} 32 regular
	./${PROG} 32 lock

clean:
	rm -f ./${PROG} *.o

man:
	man ./rseq.2

plot:
	./plot.py data
#!/bin/bash
# Run the rseq benchmark for every mode and 1..32 threads, appending
# one result line per run to ./data.
# BUG FIX: `rm data` errors out when the file does not exist yet; use -f.
rm -f data
for mode in rseq-atomic rseq regular lock getcpu-atomic; do
    for threads in $(seq 1 32); do
        ./rseq "$threads" "$mode" 8 >> data
    done
done
echo "Run: ./plot.py data"
mode=rseq-atomic threads=1 sum=400000000 state=ok aborts=0 cputime=2.199005s per_increment=5.497513ns
mode=rseq-atomic threads=2 sum=800000000 state=ok aborts=0 cputime=3.284306s per_increment=4.105382ns
mode=rseq-atomic threads=3 sum=1200000000 state=ok aborts=0 cputime=4.448196s per_increment=3.706830ns
mode=rseq-atomic threads=4 sum=1600000000 state=ok aborts=0 cputime=8.446127s per_increment=5.278830ns
mode=rseq-atomic threads=5 sum=2000000000 state=ok aborts=0 cputime=9.053520s per_increment=4.526760ns
mode=rseq-atomic threads=6 sum=2400000000 state=ok aborts=0 cputime=12.195459s per_increment=5.081441ns
mode=rseq-atomic threads=7 sum=2800000000 state=ok aborts=0 cputime=11.748222s per_increment=4.195793ns
mode=rseq-atomic threads=8 sum=3200000000 state=ok aborts=0 cputime=15.032143s per_increment=4.697545ns
mode=rseq-atomic threads=9 sum=3600000000 state=ok aborts=0 cputime=16.116472s per_increment=4.476798ns
mode=rseq-atomic threads=10 sum=4000000000 state=ok aborts=0 cputime=20.672133s per_increment=5.168033ns
mode=rseq-atomic threads=11 sum=4400000000 state=ok aborts=0 cputime=23.798913s per_increment=5.408844ns
mode=rseq-atomic threads=12 sum=4800000000 state=ok aborts=0 cputime=26.946868s per_increment=5.613931ns
mode=rseq-atomic threads=13 sum=5200000000 state=ok aborts=0 cputime=29.666975s per_increment=5.705187ns
mode=rseq-atomic threads=14 sum=5600000000 state=ok aborts=0 cputime=33.483891s per_increment=5.979266ns
mode=rseq-atomic threads=15 sum=6000000000 state=ok aborts=0 cputime=37.403459s per_increment=6.233910ns
mode=rseq-atomic threads=16 sum=6400000000 state=ok aborts=0 cputime=40.505500s per_increment=6.328984ns
mode=rseq-atomic threads=17 sum=6800000000 state=ok aborts=0 cputime=42.792953s per_increment=6.293081ns
mode=rseq-atomic threads=18 sum=7200000000 state=ok aborts=0 cputime=45.171370s per_increment=6.273801ns
mode=rseq-atomic threads=19 sum=7600000000 state=ok aborts=0 cputime=47.448919s per_increment=6.243279ns
mode=rseq-atomic threads=20 sum=8000000000 state=ok aborts=0 cputime=49.511613s per_increment=6.188952ns
mode=rseq-atomic threads=21 sum=8400000000 state=ok aborts=0 cputime=52.517915s per_increment=6.252133ns
mode=rseq-atomic threads=22 sum=8800000000 state=ok aborts=0 cputime=55.319380s per_increment=6.286293ns
mode=rseq-atomic threads=23 sum=9200000000 state=ok aborts=0 cputime=57.941721s per_increment=6.298013ns
mode=rseq-atomic threads=24 sum=9600000000 state=ok aborts=0 cputime=60.357880s per_increment=6.287279ns
mode=rseq-atomic threads=25 sum=10000000000 state=ok aborts=0 cputime=62.908038s per_increment=6.290804ns
mode=rseq-atomic threads=26 sum=10400000000 state=ok aborts=0 cputime=65.316597s per_increment=6.280442ns
mode=rseq-atomic threads=27 sum=10800000000 state=ok aborts=0 cputime=68.210331s per_increment=6.315771ns
mode=rseq-atomic threads=28 sum=11200000000 state=ok aborts=0 cputime=71.145485s per_increment=6.352275ns
mode=rseq-atomic threads=29 sum=11600000000 state=ok aborts=0 cputime=74.014644s per_increment=6.380573ns
mode=rseq-atomic threads=30 sum=12000000000 state=ok aborts=0 cputime=80.516616s per_increment=6.709718ns
mode=rseq-atomic threads=31 sum=12400000000 state=ok aborts=0 cputime=78.501505s per_increment=6.330767ns
mode=rseq-atomic threads=32 sum=12800000000 state=ok aborts=0 cputime=81.190299s per_increment=6.342992ns
mode=rseq threads=1 sum=400000000 state=ok aborts=0 cputime=2.483927s per_increment=6.209818ns
mode=rseq threads=2 sum=800000000 state=ok aborts=1 cputime=2.927913s per_increment=3.659891ns
mode=rseq threads=3 sum=1200000000 state=ok aborts=0 cputime=4.631658s per_increment=3.859715ns
mode=rseq threads=4 sum=1600000000 state=ok aborts=2 cputime=9.087390s per_increment=5.679619ns
mode=rseq threads=5 sum=2000000000 state=ok aborts=1 cputime=7.195553s per_increment=3.597776ns
mode=rseq threads=6 sum=2400000000 state=ok aborts=0 cputime=9.132219s per_increment=3.805091ns
mode=rseq threads=7 sum=2800000000 state=ok aborts=0 cputime=10.669462s per_increment=3.810522ns
mode=rseq threads=8 sum=3200000000 state=ok aborts=2 cputime=14.527707s per_increment=4.539909ns
mode=rseq threads=9 sum=3600000000 state=ok aborts=0 cputime=15.470640s per_increment=4.297400ns
mode=rseq threads=10 sum=4000000000 state=ok aborts=3 cputime=18.031536s per_increment=4.507884ns
mode=rseq threads=11 sum=4400000000 state=ok aborts=1 cputime=20.358866s per_increment=4.627015ns
mode=rseq threads=12 sum=4800000000 state=ok aborts=4 cputime=23.474872s per_increment=4.890598ns
mode=rseq threads=13 sum=5200000000 state=ok aborts=5 cputime=29.528908s per_increment=5.678636ns
mode=rseq threads=14 sum=5600000000 state=ok aborts=8 cputime=30.623857s per_increment=5.468546ns
mode=rseq threads=15 sum=6000000000 state=ok aborts=11 cputime=33.960129s per_increment=5.660021ns
mode=rseq threads=16 sum=6400000000 state=ok aborts=14 cputime=36.712009s per_increment=5.736251ns
mode=rseq threads=17 sum=6800000000 state=ok aborts=18 cputime=38.662291s per_increment=5.685631ns
mode=rseq threads=18 sum=7200000000 state=ok aborts=22 cputime=40.966459s per_increment=5.689786ns
mode=rseq threads=19 sum=7600000000 state=ok aborts=24 cputime=42.702562s per_increment=5.618758ns
mode=rseq threads=20 sum=8000000000 state=ok aborts=17 cputime=45.202435s per_increment=5.650304ns
mode=rseq threads=21 sum=8400000000 state=ok aborts=38 cputime=48.130558s per_increment=5.729828ns
mode=rseq threads=22 sum=8800000000 state=ok aborts=48 cputime=50.154982s per_increment=5.699430ns
mode=rseq threads=23 sum=9200000000 state=ok aborts=61 cputime=53.364385s per_increment=5.800477ns
mode=rseq threads=24 sum=9600000000 state=ok aborts=57 cputime=54.841167s per_increment=5.712622ns
mode=rseq threads=25 sum=10000000000 state=ok aborts=21 cputime=56.911656s per_increment=5.691166ns
mode=rseq threads=26 sum=10400000000 state=ok aborts=39 cputime=59.314805s per_increment=5.703347ns
mode=rseq threads=27 sum=10800000000 state=ok aborts=58 cputime=61.900339s per_increment=5.731513ns
mode=rseq threads=28 sum=11200000000 state=ok aborts=42 cputime=63.666270s per_increment=5.684488ns
mode=rseq threads=29 sum=11600000000 state=ok aborts=56 cputime=66.452641s per_increment=5.728676ns
mode=rseq threads=30 sum=12000000000 state=ok aborts=87 cputime=68.713723s per_increment=5.726144ns
mode=rseq threads=31 sum=12400000000 state=ok aborts=48 cputime=70.797445s per_increment=5.709471ns
mode=rseq threads=32 sum=12800000000 state=ok aborts=69 cputime=73.319152s per_increment=5.728059ns
mode=regular threads=1 sum=400000000 state=ok aborts=0 cputime=4.445797s per_increment=11.114493ns
mode=regular threads=2 sum=800000000 state=ok aborts=0 cputime=10.181069s per_increment=12.726336ns
mode=regular threads=3 sum=1200000000 state=ok aborts=0 cputime=15.207685s per_increment=12.673070ns
mode=regular threads=4 sum=1600000000 state=ok aborts=0 cputime=20.394730s per_increment=12.746707ns
mode=regular threads=5 sum=2000000000 state=ok aborts=0 cputime=25.606271s per_increment=12.803136ns
mode=regular threads=6 sum=2400000000 state=ok aborts=0 cputime=30.550001s per_increment=12.729167ns
mode=regular threads=7 sum=2800000000 state=ok aborts=0 cputime=31.396464s per_increment=11.213023ns
mode=regular threads=8 sum=3200000000 state=ok aborts=0 cputime=40.884241s per_increment=12.776325ns
mode=regular threads=9 sum=3600000000 state=ok aborts=0 cputime=48.727290s per_increment=13.535358ns
mode=regular threads=10 sum=4000000000 state=ok aborts=0 cputime=57.116025s per_increment=14.279006ns
mode=regular threads=11 sum=4400000000 state=ok aborts=0 cputime=65.007191s per_increment=14.774362ns
mode=regular threads=12 sum=4800000000 state=ok aborts=0 cputime=72.886369s per_increment=15.184660ns
mode=regular threads=13 sum=5200000000 state=ok aborts=0 cputime=82.631048s per_increment=15.890586ns
mode=regular threads=14 sum=5600000000 state=ok aborts=0 cputime=83.803839s per_increment=14.964971ns
mode=regular threads=15 sum=6000000000 state=ok aborts=0 cputime=101.826165s per_increment=16.971027ns
mode=regular threads=16 sum=6399999999 state=fail aborts=0 cputime=103.622771s per_increment=16.191058ns
mode=regular threads=17 sum=6799999980 state=fail aborts=0 cputime=117.516558s per_increment=17.281847ns
mode=regular threads=18 sum=7199999984 state=fail aborts=0 cputime=124.609096s per_increment=17.306819ns
mode=regular threads=19 sum=7599999959 state=fail aborts=0 cputime=131.441945s per_increment=17.294993ns
mode=regular threads=20 sum=7999999972 state=fail aborts=0 cputime=127.853660s per_increment=15.981708ns
mode=regular threads=21 sum=8399999947 state=fail aborts=0 cputime=145.638661s per_increment=17.337936ns
mode=regular threads=22 sum=8799999943 state=fail aborts=0 cputime=152.054029s per_increment=17.278867ns
mode=regular threads=23 sum=9199999933 state=fail aborts=0 cputime=159.989756s per_increment=17.390191ns
mode=regular threads=24 sum=9599999946 state=fail aborts=0 cputime=167.602853s per_increment=17.458631ns
mode=regular threads=25 sum=9999999946 state=fail aborts=0 cputime=173.688899s per_increment=17.368890ns
mode=regular threads=26 sum=10399999956 state=fail aborts=0 cputime=180.728688s per_increment=17.377759ns
mode=regular threads=27 sum=10799999959 state=fail aborts=0 cputime=187.942425s per_increment=17.402076ns
mode=regular threads=28 sum=11199999958 state=fail aborts=0 cputime=194.921763s per_increment=17.403729ns
mode=regular threads=29 sum=11599999957 state=fail aborts=0 cputime=201.910554s per_increment=17.406082ns
mode=regular threads=30 sum=11999999966 state=fail aborts=0 cputime=208.667108s per_increment=17.388926ns
mode=regular threads=31 sum=12399999964 state=fail aborts=0 cputime=218.638442s per_increment=17.632132ns
mode=regular threads=32 sum=12799999939 state=fail aborts=0 cputime=222.786872s per_increment=17.405224ns
mode=lock threads=1 sum=400000000 state=ok aborts=0 cputime=9.739219s per_increment=24.348048ns
mode=lock threads=2 sum=800000000 state=ok aborts=0 cputime=19.477206s per_increment=24.346508ns
mode=lock threads=3 sum=1200000000 state=ok aborts=0 cputime=29.639653s per_increment=24.699711ns
mode=lock threads=4 sum=1600000000 state=ok aborts=0 cputime=39.447133s per_increment=24.654458ns
mode=lock threads=5 sum=2000000000 state=ok aborts=0 cputime=49.416199s per_increment=24.708099ns
mode=lock threads=6 sum=2400000000 state=ok aborts=0 cputime=58.912073s per_increment=24.546697ns
mode=lock threads=7 sum=2800000000 state=ok aborts=0 cputime=69.373970s per_increment=24.776418ns
mode=lock threads=8 sum=3200000000 state=ok aborts=0 cputime=80.045758s per_increment=25.014299ns
mode=lock threads=9 sum=3600000000 state=ok aborts=0 cputime=95.506874s per_increment=26.529687ns
mode=lock threads=10 sum=4000000000 state=ok aborts=0 cputime=105.647129s per_increment=26.411782ns
mode=lock threads=11 sum=4400000000 state=ok aborts=0 cputime=119.966161s per_increment=27.265037ns
mode=lock threads=12 sum=4800000000 state=ok aborts=0 cputime=144.790100s per_increment=30.164604ns
mode=lock threads=13 sum=5200000000 state=ok aborts=0 cputime=169.275312s per_increment=32.552945ns
mode=lock threads=14 sum=5600000000 state=ok aborts=0 cputime=184.008284s per_increment=32.858622ns
mode=lock threads=15 sum=6000000000 state=ok aborts=0 cputime=206.639565s per_increment=34.439927ns
mode=lock threads=16 sum=6400000000 state=ok aborts=0 cputime=220.186430s per_increment=34.404130ns
mode=lock threads=17 sum=6800000000 state=ok aborts=0 cputime=243.215361s per_increment=35.766965ns
mode=lock threads=18 sum=7200000000 state=ok aborts=0 cputime=256.791891s per_increment=35.665540ns
mode=lock threads=19 sum=7600000000 state=ok aborts=0 cputime=270.841824s per_increment=35.637082ns
mode=lock threads=20 sum=8000000000 state=ok aborts=0 cputime=286.147054s per_increment=35.768382ns
mode=lock threads=21 sum=8400000000 state=ok aborts=0 cputime=303.063539s per_increment=36.078993ns
mode=lock threads=22 sum=8800000000 state=ok aborts=0 cputime=334.728624s per_increment=38.037344ns
mode=lock threads=23 sum=9200000000 state=ok aborts=0 cputime=365.422783s per_increment=39.719868ns
mode=lock threads=24 sum=9600000000 state=ok aborts=0 cputime=383.564521s per_increment=39.954638ns
mode=lock threads=25 sum=10000000000 state=ok aborts=0 cputime=400.088520s per_increment=40.008852ns
mode=lock threads=26 sum=10400000000 state=ok aborts=0 cputime=413.845634s per_increment=39.792849ns
mode=lock threads=27 sum=10800000000 state=ok aborts=0 cputime=432.694245s per_increment=40.064282ns
mode=lock threads=28 sum=11200000000 state=ok aborts=0 cputime=448.437452s per_increment=40.039058ns
mode=lock threads=29 sum=11600000000 state=ok aborts=0 cputime=479.398819s per_increment=41.327484ns
mode=lock threads=30 sum=12000000000 state=ok aborts=0 cputime=485.327195s per_increment=40.443933ns
mode=lock threads=31 sum=12400000000 state=ok aborts=0 cputime=498.999020s per_increment=40.241856ns
mode=lock threads=32 sum=12800000000 state=ok aborts=0 cputime=518.804851s per_increment=40.531629ns
mode=getcpu-atomic threads=1 sum=400000000 state=ok aborts=0 cputime=4.590758s per_increment=11.476895ns
mode=getcpu-atomic threads=2 sum=800000000 state=ok aborts=0 cputime=10.373209s per_increment=12.966511ns
mode=getcpu-atomic threads=3 sum=1200000000 state=ok aborts=0 cputime=15.454717s per_increment=12.878931ns
mode=getcpu-atomic threads=4 sum=1600000000 state=ok aborts=0 cputime=18.276674s per_increment=11.422921ns
mode=getcpu-atomic threads=5 sum=2000000000 state=ok aborts=0 cputime=25.616891s per_increment=12.808445ns
mode=getcpu-atomic threads=6 sum=2400000000 state=ok aborts=0 cputime=30.558029s per_increment=12.732512ns
mode=getcpu-atomic threads=7 sum=2800000000 state=ok aborts=0 cputime=35.695183s per_increment=12.748280ns
mode=getcpu-atomic threads=8 sum=3200000000 state=ok aborts=0 cputime=41.202231s per_increment=12.875697ns
mode=getcpu-atomic threads=9 sum=3600000000 state=ok aborts=0 cputime=48.857853s per_increment=13.571626ns
mode=getcpu-atomic threads=10 sum=4000000000 state=ok aborts=0 cputime=57.148052s per_increment=14.287013ns
mode=getcpu-atomic threads=11 sum=4400000000 state=ok aborts=0 cputime=65.426838s per_increment=14.869736ns
mode=getcpu-atomic threads=12 sum=4800000000 state=ok aborts=0 cputime=74.198416s per_increment=15.458003ns
mode=getcpu-atomic threads=13 sum=5200000000 state=ok aborts=0 cputime=88.584374s per_increment=17.035457ns
mode=getcpu-atomic threads=14 sum=5600000000 state=ok aborts=0 cputime=97.566401s per_increment=17.422572ns
mode=getcpu-atomic threads=15 sum=6000000000 state=ok aborts=0 cputime=109.718405s per_increment=18.286401ns
mode=getcpu-atomic threads=16 sum=6400000000 state=ok aborts=0 cputime=122.916752s per_increment=19.205742ns
mode=getcpu-atomic threads=17 sum=6800000000 state=ok aborts=0 cputime=134.455988s per_increment=19.772939ns
mode=getcpu-atomic threads=18 sum=7200000000 state=ok aborts=0 cputime=140.480629s per_increment=19.511198ns
mode=getcpu-atomic threads=19 sum=7600000000 state=ok aborts=0 cputime=146.157256s per_increment=19.231218ns
mode=getcpu-atomic threads=20 sum=8000000000 state=ok aborts=0 cputime=155.691843s per_increment=19.461480ns
mode=getcpu-atomic threads=21 sum=8400000000 state=ok aborts=0 cputime=163.030484s per_increment=19.408391ns
mode=getcpu-atomic threads=22 sum=8800000000 state=ok aborts=0 cputime=170.584687s per_increment=19.384623ns
mode=getcpu-atomic threads=23 sum=9200000000 state=ok aborts=0 cputime=176.381227s per_increment=19.171873ns
mode=getcpu-atomic threads=24 sum=9600000000 state=ok aborts=0 cputime=185.423582s per_increment=19.314956ns
mode=getcpu-atomic threads=25 sum=10000000000 state=ok aborts=0 cputime=192.877967s per_increment=19.287797ns
mode=getcpu-atomic threads=26 sum=10400000000 state=ok aborts=0 cputime=199.917576s per_increment=19.222844ns
mode=getcpu-atomic threads=27 sum=10800000000 state=ok aborts=0 cputime=207.822609s per_increment=19.242834ns
mode=getcpu-atomic threads=28 sum=11200000000 state=ok aborts=0 cputime=216.261564s per_increment=19.309068ns
mode=getcpu-atomic threads=29 sum=11600000000 state=ok aborts=0 cputime=213.773868s per_increment=18.428782ns
mode=getcpu-atomic threads=30 sum=12000000000 state=ok aborts=0 cputime=221.522895s per_increment=18.460241ns
mode=getcpu-atomic threads=31 sum=12400000000 state=ok aborts=0 cputime=240.809263s per_increment=19.420102ns
mode=getcpu-atomic threads=32 sum=12800000000 state=ok aborts=0 cputime=251.500147s per_increment=19.648449ns
23-rseq/plot.Ryzen7_PRO_5850U.png

70.3 KiB

#!/usr/bin/python3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def read_log(fn):
    """Parse a benchmark log of space-separated key=value records.

    Each non-empty line looks like:
        mode=rseq threads=4 sum=... state=ok aborts=0 cputime=1.2s per_increment=3.4ns

    Returns a pandas DataFrame with one row per record. The numeric
    columns (threads, sum, aborts, cputime, per_increment) are converted
    to float; trailing unit characters ("s", "ns") are stripped first.

    BUG FIX: blank lines (e.g. a trailing newline or a separator left by
    the shell driver) used to raise IndexError; they are now skipped.
    """
    rows = []
    header = None
    with open(fn) as fd:
        for line in fd:
            line = line.strip()
            if not line:
                continue  # tolerate blank/trailing lines
            fields = line.split(" ")
            # Every record carries the same keys; remembering the last
            # header seen matches the original behavior.
            header = [f.split("=")[0] for f in fields]
            rows.append([f.split("=")[1].rstrip("ns") for f in fields])
    df = pd.DataFrame(columns=header, data=rows)
    for col in "threads sum aborts cputime per_increment".split():
        df[col] = df[col].apply(float)
    return df
if __name__ == "__main__":
    import sys

    # Require exactly one argument: the log file produced by run.sh.
    if len(sys.argv) < 2:
        sys.exit("usage: %s [LOGFILE]" % sys.argv[0])

    frame = read_log(sys.argv[1])
    # Pivot to one column per mode, indexed by thread count.
    per_increment = frame.set_index(["mode", "threads"]).per_increment.unstack().T
    axis = per_increment.plot(marker='x', grid=True, figsize=(10, 10))
    axis.set_ylim((0, None))
    axis.set_ylabel("Per Increment [ns]")
    axis.get_figure().savefig('plot.png')
// int operation_rseq(struct rseq * rseq, struct cacheline *counters) {
// Increment the current CPU's counter inside a restartable sequence;
// the number of aborts is returned in %eax.
// According to the (System V AMD64) calling convention the arguments
// come in registers:
// rseq:     %rdi
// counters: %rsi
.p2align 4
.globl operation_rseq
.type operation_rseq, @function
operation_rseq:
.cfi_startproc
    // We will return the number of aborts in %eax. Initialize
    // eax with zero.
    xor %eax, %eax // %eax = 0
    // We inform the kernel that we are now within a restartable
    // sequence by moving a pointer to operation_rseq_cs (see below)
    // to the kernel-registered rseq object.
    // After an abort, we also jump to this label (restart_ip).
.restart_ip:
    // FIXME: Update rseq->rseq_cs
    // The restartable sequence.
    // Implements: [rseq->cpu_id].counter ++;
.start_ip: // Start of restartable sequence
    // HINT: Structure of rseq is documented in /usr/include/linux/rseq.h
    // HINT: rseq->cpu_id == 4(%rdi)
    // HINT: Each counter-cache-line is 64 bytes long
.end_ip: // End of restartable sequence
    ret
    // The abort trampoline.
    // Before jumping to the abort label, the kernel checks that a
    // specific signature (RSEQ_SIG, registered via rseq(2)) precedes
    // it. We hide this signature in a well-crafted assembler
    // instruction so it is never executed as code:
    // ud1 <sig>(%rip),%edi
    .byte 0x0f, 0xb9, 0x3d
    .long 0x53053053 // RSEQ_SIG
.abort_ip: // On abort, the kernel will jump here
    // FIXME: count aborts in %eax
    jmp .restart_ip
// } End of operation_rseq()
.cfi_endproc
.size operation_rseq, .-operation_rseq
// struct rseq_cs operation_rseq_cs -- descriptor for our restartable
// sequence. Its layout must match struct rseq_cs from
// /usr/include/linux/rseq.h: 32 bytes, aligned to a 32-byte boundary.
// operation_rseq (above) is supposed to install a pointer to this
// object into rseq->rseq_cs (see the FIXME there).
.section .data.rel.local,"aw"
.align 32
.type operation_rseq_cs, @object
.size operation_rseq_cs, 32
operation_rseq_cs:
    .long 0                    // __u32 version (must be 0)
    .long 0                    // __u32 flags   (no RSEQ_CS_FLAG_* set)
    .quad .start_ip            // __u64 start_ip: first instruction of the sequence
    .quad .end_ip - .start_ip  // __u64 post_commit_offset: length of the sequence
    .quad .abort_ip            // __u64 abort_ip: where the kernel jumps on abort
.section .note.GNU-stack,"",@progbits
.\" Copyright 2015-2020 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
.\"
.\" %%%LICENSE_START(VERBATIM)
.\" Permission is granted to make and distribute verbatim copies of this
.\" manual provided the copyright notice and this permission notice are
.\" preserved on all copies.
.\"
.\" Permission is granted to copy and distribute modified versions of this
.\" manual under the conditions for verbatim copying, provided that the
.\" entire resulting derived work is distributed under the terms of a
.\" permission notice identical to this one.
.\"
.\" Since the Linux kernel and libraries are constantly changing, this
.\" manual page may be incorrect or out-of-date. The author(s) assume no
.\" responsibility for errors or omissions, or for damages resulting from
.\" the use of the information contained herein. The author(s) may not
.\" have taken the same level of care in the production of this manual,
.\" which is licensed free of charge, as they might when working
.\" professionally.
.\"
.\" Formatted or processed versions of this manual, if unaccompanied by
.\" the source, must acknowledge the copyright and authors of this work.
.\" %%%LICENSE_END
.\"
.TH RSEQ 2 2020-06-05 "Linux" "Linux Programmer's Manual"
.SH NAME
rseq \- Restartable sequences and cpu number cache
.SH SYNOPSIS
.nf
.B #include <linux/rseq.h>
.sp
.BI "int rseq(struct rseq * " rseq ", uint32_t " rseq_len ", int " flags ", uint32_t " sig ");"
.sp
.SH DESCRIPTION
A restartable sequence is a sequence of instructions guaranteed to be executed
atomically with respect to other threads and signal handlers on the current
CPU. If its execution does not complete atomically, the kernel changes the
execution flow by jumping to an abort handler defined by user-space for that
restartable sequence.
Using restartable sequences requires registering a
.BR __rseq_abi
thread-local storage data structure (struct rseq) through the
.BR rseq ()
system call. Only one
.BR __rseq_abi
can be registered per thread, so user-space libraries and applications must
follow a user-space ABI defining how to share this resource. The ABI defining
how to share this resource between applications and libraries is defined by the
C library.
The
.BR __rseq_abi
contains a
.I rseq_cs
field which points to the currently executing critical section. For each
thread, a single rseq critical section can run at any given point. Each
critical section needs to be implemented in assembly.
The
.BR rseq ()
ABI accelerates user-space operations on per-cpu data by defining a
shared data structure ABI between each user-space thread and the kernel.
It allows user-space to perform update operations on per-cpu data
without requiring heavy-weight atomic operations.
The term CPU used in this documentation refers to a hardware execution
context. For instance, each CPU number returned by
.BR sched_getcpu ()
is a CPU. The current CPU refers to the CPU on which the registered thread is
running.
Restartable sequences are atomic with respect to preemption (making it
atomic with respect to other threads running on the same CPU), as well
as signal delivery (user-space execution contexts nested over the same
thread). They either complete atomically with respect to preemption on
the current CPU and signal delivery, or they are aborted.
Restartable sequences are suited for update operations on per-cpu data.
Restartable sequences can be used on data structures shared between threads
within a process, and on data structures shared between threads across
different processes.
.PP
Some examples of operations that can be accelerated or improved
by this ABI:
.IP \[bu] 2
Memory allocator per-cpu free-lists,
.IP \[bu] 2
Querying the current CPU number,
.IP \[bu] 2
Incrementing per-CPU counters,
.IP \[bu] 2
Modifying data protected by per-CPU spinlocks,
.IP \[bu] 2
Inserting/removing elements in per-CPU linked-lists,
.IP \[bu] 2
Writing/reading per-CPU ring buffers content.
.IP \[bu] 2
Accurately reading performance monitoring unit counters
with respect to thread migration.
.PP
Restartable sequences must not perform system calls. Doing so may result
in termination of the process by a segmentation fault.
.PP
The
.I rseq
argument is a pointer to the thread-local rseq structure to be shared
between kernel and user-space.
.PP
The layout of
.B struct rseq
is as follows:
.TP
.B Structure alignment
This structure is aligned on 32-byte boundary.
.TP
.B Structure size
This structure is fixed-size (32 bytes). Its size is passed as parameter to the
rseq system call.
.PP
.in +8n
.EX
struct rseq {
__u32 cpu_id_start;
__u32 cpu_id;
union {
/* Edited out for conciseness. [...] */
} rseq_cs;
__u32 flags;
} __attribute__((aligned(32)));
.EE
.TP
.B Fields
.TP
.in +4n
.I cpu_id_start
Optimistic cache of the CPU number on which the registered thread is
running. Its value is guaranteed to always be a possible CPU number,
even when rseq is not registered. Its value should always be confirmed by
reading the cpu_id field before user-space performs any side-effect (e.g.
storing to memory).
This field is an optimistic cache in the sense that it is always
guaranteed to hold a valid CPU number in the range [ 0 ..
nr_possible_cpus - 1 ]. It can therefore be loaded by user-space and
used as an offset in per-cpu data structures without having to
check whether its value is within the valid bounds compared to the
number of possible CPUs in the system.
Initialized by user-space to a possible CPU number (e.g., 0), updated
by the kernel for threads registered with rseq.
For user-space applications executed on a kernel without rseq support,
the cpu_id_start field stays initialized at 0, which is indeed a valid
CPU number. It is therefore valid to use it as an offset in per-cpu data
structures, and only validate whether it's actually the current CPU
number by comparing it with the cpu_id field within the rseq critical
section. If the kernel does not provide rseq support, that cpu_id field
stays initialized at -1, so the comparison always fails, as intended.
It is up to user-space to implement a fall-back mechanism for scenarios where
rseq is not available.
.in
.TP
.in +4n
.I cpu_id
Cache of the CPU number on which the registered thread is running. Initialized
by user-space to -1, updated by the kernel for threads registered with rseq.
.in
.TP
.in +4n
.I rseq_cs
The rseq_cs field is a pointer to a struct rseq_cs. It is NULL when no
rseq assembly block critical section is active for the registered thread.
Setting it to point to a critical section descriptor (struct rseq_cs)
marks the beginning of the critical section.
Initialized by user-space to NULL.
Updated by user-space, which sets the address of the currently
active rseq_cs at the beginning of assembly instruction sequence
block, and set to NULL by the kernel when it restarts an assembly
instruction sequence block, as well as when the kernel detects that
it is preempting or delivering a signal outside of the range
targeted by the rseq_cs. Also needs to be set to NULL by user-space
before reclaiming memory that contains the targeted struct rseq_cs.
Read and set by the kernel.
.in
.TP
.in +4n
.I flags
Flags indicating the restart behavior for the registered thread. This is
mainly used for debugging purposes. Can be a combination of:
.IP \[bu]
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT: Inhibit instruction sequence block restart
on preemption for this thread.
.IP \[bu]
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL: Inhibit instruction sequence block restart
on signal delivery for this thread.
.IP \[bu]
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE: Inhibit instruction sequence block restart
on migration for this thread.
.in
Initialized by user-space, used by the kernel.
.PP
The layout of
.B struct rseq_cs
version 0 is as follows:
.TP
.B Structure alignment
This structure is aligned on 32-byte boundary.
.TP
.B Structure size
This structure has a fixed size of 32 bytes.
.PP
.in +8n
.EX
struct rseq_cs {
__u32 version;
__u32 flags;
__u64 start_ip;
__u64 post_commit_offset;
__u64 abort_ip;
} __attribute__((aligned(32)));
.EE
.TP
.B Fields
.TP
.in +4n
.I version
Version of this structure. Should be initialized to 0.
.in
.TP
.in +4n
.I flags
Flags indicating the restart behavior of this structure. Can be a combination
of:
.IP \[bu]
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT: Inhibit instruction sequence block restart
on preemption for this critical section.
.IP \[bu]
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL: Inhibit instruction sequence block restart
on signal delivery for this critical section.
.IP \[bu]
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE: Inhibit instruction sequence block restart
on migration for this critical section.
.TP
.in +4n
.I start_ip
Instruction pointer address of the first instruction of the sequence of
consecutive assembly instructions.
.in
.TP
.in +4n
.I post_commit_offset
Offset (from start_ip address) of the address after the last instruction
of the sequence of consecutive assembly instructions.
.in
.TP
.in +4n
.I abort_ip
Instruction pointer address where to move the execution flow in case of
abort of the sequence of consecutive assembly instructions.
.in
.PP
The
.I rseq_len
argument is the size of the
.I struct rseq
to register.
.PP
The
.I flags
argument is 0 for registration, and
.IR RSEQ_FLAG_UNREGISTER
for unregistration.
.PP
The
.I sig
argument is the 32-bit signature to be expected before the abort
handler code.
.PP
A single library per process should keep the rseq structure in a
thread-local storage variable.
The
.I cpu_id
field should be initialized to -1, and the
.I cpu_id_start
field should be initialized to a possible CPU value (typically 0).
.PP
Each thread is responsible for registering and unregistering its rseq
structure. No more than one rseq structure address can be registered
per thread at a given time.
.PP
Reclaim of rseq object's memory must only be done after either an
explicit rseq unregistration is performed or after the thread exits.
.PP
In a typical usage scenario, the thread registering the rseq
structure will be performing loads and stores from/to that structure. It
is however also allowed to read that structure from other threads.
The rseq field updates performed by the kernel provide relaxed atomicity
semantics (atomic store, without memory ordering), which guarantee that other
threads performing relaxed atomic reads (atomic load, without memory ordering)
of the cpu number cache will always observe a consistent value.
.SH RETURN VALUE
A return value of 0 indicates success. On error, \-1 is returned, and
.I errno
is set appropriately.
.SH ERRORS
.TP
.B EINVAL
Either
.I flags
contains an invalid value, or
.I rseq
contains an address which is not appropriately aligned, or
.I rseq_len
contains an incorrect size.
.TP
.B ENOSYS
The
.BR rseq ()
system call is not implemented by this kernel.
.TP
.B EFAULT
.I rseq
is an invalid address.
.TP
.B EBUSY
Restartable sequence is already registered for this thread.
.TP
.B EPERM
The
.I sig
argument on unregistration does not match the signature received
on registration.
.SH VERSIONS
The
.BR rseq ()
system call was added in Linux 4.18.
.SH CONFORMING TO
.BR rseq ()
is Linux-specific.
.in
.SH SEE ALSO
.BR sched_getcpu (3) ,
.BR membarrier (2)
\ No newline at end of file
#define _GNU_SOURCE
#include <pthread.h>
#include <unistd.h>
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <stdatomic.h>
#include <string.h>
#include <sys/sysinfo.h>
#include <stdbool.h>
#include <malloc.h>
// Print a perror(3) diagnostic for `msg` and terminate the process
// with EXIT_FAILURE. Wrapped in do/while(0) so it behaves like a
// single statement.
#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while(0)
// With modern glibc versions (>2.35), the glibc already registers
// a rseq area for each thread that is started with
// pthread_create. In that case, we derive this pointer from the
// thread_pointer(). For details, see
// https://www.gnu.org/software/libc/manual/html_node/Restartable-Sequences.html
#if __has_include ("sys/rseq.h")
#include <sys/rseq.h>
#if RSEQ_SIG != 0x53053053
#error "glibc defined RSEQ_SIG differently"
#endif
// Return the rseq area that glibc (>= 2.35) has already registered for
// the calling thread: it lives at __rseq_offset bytes from the thread
// pointer (see the glibc manual, "Restartable Sequences").
// Fixes: use a proper prototype (void) instead of an old-style empty
// parameter list, and do byte arithmetic on char* instead of relying
// on the GNU void*-arithmetic extension.
static struct rseq * rseq_register(void) {
    return (struct rseq *)((char *)__builtin_thread_pointer() + __rseq_offset);
}
#else
#include <linux/rseq.h>
#define RSEQ_SIG 0x53053053
// The rseq(2) syscall has no glibc wrapper. Therefore, we define our
// own. Please run `make man` to see the man page rseq(2).
//
// BUG FIX: the wrapper previously ignored its `sig` parameter and
// always passed RSEQ_SIG to the kernel. That silently works for
// registration, but makes unregistration (RSEQ_FLAG_UNREGISTER) with
// any other signature impossible and hides caller mistakes.
int sys_rseq(struct rseq * rseq, uint32_t rseq_len, int flags, uint32_t sig) {
    return syscall(SYS_rseq, rseq, rseq_len, flags, sig);
}
// Allocate and register a rseq area for the calling thread.
// The area must be suitably aligned (we use its own size, 32 bytes,
// which satisfies the kernel's alignment requirement).
// Terminates the process via die() on allocation or syscall failure.
struct rseq *rseq_register() {
    struct rseq *ret = memalign(sizeof(struct rseq), sizeof(struct rseq));
    if (!ret)
        die("memalign");
    memset(ret, 0, sizeof(struct rseq));
    // cpu ids start out as "uninitialized"; the kernel fills them in.
    ret->cpu_id_start = -1;
    ret->cpu_id = -1;
    // Pass RSEQ_SIG explicitly: it must match the signature placed
    // before the abort handler in rseq.S, or aborts raise SIGSEGV.
    if (sys_rseq(ret, sizeof(struct rseq), 0, RSEQ_SIG) < 0)
        die("rseq");
    return ret;
}
#endif
// This data structure is exactly one cache-line wide (assuming that a
// cache line is 64 bytes). We allocate an array of cpu-local
// counters, where each CPU only operates on a single cache line, so
// the counters do not falsely share lines and most cache-line
// transfers between CPUs are avoided.
struct cacheline {
    union {
        char data[64];              // pads the structure to a full line
        struct {
            uint64_t counter;       // the per-CPU increment counter
            pthread_mutex_t mutex;  // used only by the "lock" variant
        };
    };
};
// If counter + mutex ever outgrow 64 bytes, the union grows past one
// line and the "one line per CPU" property silently breaks. Make that
// a compile-time error instead.
_Static_assert(sizeof(struct cacheline) == 64,
               "struct cacheline must be exactly one cache line");
// We will define multiple operation_t functions that all implement
// the same behavior: they increment a cpu-local counter by 1.
// First argument: the thread's rseq area (ignored by the non-rseq
// variants). Second argument: the per-CPU counter array. The return
// value is the number of rseq aborts (0 for all non-rseq variants).
typedef int (*operation_t)(struct rseq *_, struct cacheline *);
// The simplest variant of a CPU-local counter: look up the current
// CPU with getcpu() and bump that CPU's counter. Because the
// read-update-write cycle is not atomic, a thread that is preempted
// or migrated in between loses updates -- this variant is racy and
// produces incorrect sums.
int operation_regular(struct rseq*_, struct cacheline *counters) {
    unsigned int cpu;
    getcpu(&cpu, NULL);
    struct cacheline *line = &counters[cpu];
    line->counter = line->counter + 1;
    return 0;
}
// A correct, but slow variant: serialize the increment with the
// pthread mutex that lives in the same cache line as the counter.
int operation_lock(struct rseq*_, struct cacheline *counters) {
    unsigned int cpu;
    getcpu(&cpu, NULL);
    struct cacheline *line = &counters[cpu];
    pthread_mutex_lock(&line->mutex);
    line->counter++;
    pthread_mutex_unlock(&line->mutex);
    return 0;
}
// Variant that uses getcpu() + atomic_fetch_add
int operation_atomic(struct rseq* _, struct cacheline *counters) {
// FIXME: Implement variant
return 0;
}
// Variant without getcpu: Like operation_atomic, but uses the
// restartable sequence to retrieve the cpu id.
// Please look at /usr/include/linux/rseq.h for the documentation of struct rseq
int operation_rseq_atomic(struct rseq* rs, struct cacheline *counters) {
// FIXME: Implement variant
return 0;
}
// Variant that uses no atomic operations and fully relies on rseq.
// This variant is implemented in assembler (see rseq.S). Its return
// value is the number of aborted/restarted sequences for this call
// (accumulated by thread_handler; only this variant reports nonzero).
extern int operation_rseq(struct rseq *, struct cacheline*);
// FIXME: Implement counter_rseq in rseq.S
////////////////////////////////////////////////////////////////
// The Benchmarking code
//
// We start NTHREADS threads and each thread executes
// ROUNDS_PER_THREAD cpu-local increments
int ROUNDS_PER_THREAD = 50000000;  // scaled by the optional argv[3] factor in main()
// Arguments handed to every benchmark thread; one shared instance,
// only read by the threads.
struct thread_args {
    operation_t operation;     // the increment variant under test
    struct cacheline *counters; // the per-CPU counter array
};
// Benchmark thread body: register (or look up) this thread's rseq
// area, then execute the selected operation ROUNDS_PER_THREAD times.
// Returns the accumulated number of rseq aborts, smuggled through the
// void* return value (only nonzero for the rseq variant).
void* thread_handler(void* data) {
    struct thread_args *args = data;
    // Register rseq area or use glibc's rseq
    struct rseq *rseq = rseq_register();
    // BUG FIX: this diagnostic went to stdout before, corrupting the
    // machine-readable result line that mkdata.sh redirects into the
    // data file (`./rseq ... >> data`). Keep diagnostics on stderr.
    fprintf(stderr, "rseq: %p\n", (void *)rseq);
    // Execute the given operation ROUNDS_PER_THREAD times and count
    // the number of aborts (only != 0 for rseq). Cast the bound once
    // to avoid a signed/unsigned comparison in the loop condition.
    uint64_t rounds = (uint64_t)ROUNDS_PER_THREAD;
    uint64_t aborts = 0;
    for (uint64_t i = 0; i < rounds; i++) {
        aborts += args->operation(rseq, args->counters);
    }
    // Return the number of rseq aborts
    return (void*) aborts;
}
// Report the expected command line on stderr, then terminate the
// program unsuccessfully. Never returns.
static void usage(char *argv0) {
    fprintf(stderr,
            "usage: %s <threads> <regular|lock|getcpu-atomic|rseq-atomic|rseq> [rounds]\n",
            argv0);
    exit(EXIT_FAILURE);
}
// Entry point: parse <threads> <mode> [rounds], run the benchmark and
// print one machine-readable result line on stdout (per-CPU counter
// dumps go to stderr).
int main(int argc, char *argv[]) {
    // Parameter Parsing. This is boring
    if (argc < 3) usage(argv[0]);
    if (argc == 4)
        ROUNDS_PER_THREAD *= atoi(argv[3]);
    int CPUS = get_nprocs();
    int NTHREADS = atoi(argv[1]);
    // Reject non-numeric/non-positive thread counts early; otherwise
    // we would "benchmark" zero threads and divide by a zero sum below.
    if (NTHREADS <= 0 || ROUNDS_PER_THREAD <= 0) usage(argv[0]);
    char *MODE = argv[2];
    struct thread_args args;
    if (!strcmp(MODE, "rseq")) args.operation = operation_rseq;
    else if (!strcmp(MODE, "getcpu-atomic")) args.operation = operation_atomic;
    else if (!strcmp(MODE, "rseq-atomic")) args.operation = operation_rseq_atomic;
    else if (!strcmp(MODE, "regular")) args.operation = operation_regular;
    else if (!strcmp(MODE, "lock")) args.operation = operation_lock;
    else usage(argv[0]);
    // Initialize the CPU-local counters. Each CPU gets a struct
    // cacheline of its own. We use aligned_alloc(3) to get
    // cache-line-aligned memory from the allocator.
    args.counters = aligned_alloc(sizeof(struct cacheline), CPUS * sizeof(struct cacheline));
    if (!args.counters) die("aligned_alloc");
    // BUG FIX: aligned_alloc(3), unlike calloc(3), returns
    // *uninitialized* memory. Without this memset the counters start
    // from garbage and the reported sums are wrong.
    memset(args.counters, 0, CPUS * sizeof(struct cacheline));
    // Initialize locks for the lock variant
    for (int i = 0; i < CPUS; i++) {
        pthread_mutex_init(&args.counters[i].mutex, NULL);
    }
    // The actual benchmarking code
    ////////////////////////////////////////////////////////////////
    struct timespec start, end;
    // Start Time. We use the CLOCK_PROCESS_CPUTIME_ID to get the
    // number of CPU-seconds spent.
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start) < 0)
        die("clock_gettime");
    // Create NTHREADS threads
    pthread_t threads[NTHREADS];
    for (int i = 0; i < NTHREADS; i++) {
        if (pthread_create(&threads[i], NULL, thread_handler, &args) != 0)
            die("pthread_create");
    }
    // Wait for all threads to complete and accumulate the number of
    // aborts. Join through a genuine void* to avoid type-punning the
    // out-pointer (the old (void**)&uint64_t cast was UB-prone).
    uint64_t aborts = 0;
    for (int i = 0; i < NTHREADS; i++) {
        void *thread_ret = NULL;
        pthread_join(threads[i], &thread_ret);
        aborts += (uint64_t)(uintptr_t)thread_ret;
    }
    // End Time
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end) < 0)
        die("clock_gettime");
    // Calculate the time delta between both points in time.
    double delta = end.tv_sec - start.tv_sec;
    delta += (end.tv_nsec - start.tv_nsec) / 1e9;
    // Print out the cpu-local counters. With this output and a low
    // number of threads you can see the thread migration.
    // (PRIu64 instead of %ld: uint64_t with a long specifier is UB.)
    uint64_t sum = 0;
    for (int i = 0; i < CPUS; i++) {
        fprintf(stderr, "counter[cpu=%d] = %" PRIu64 "\n", i, args.counters[i].counter);
        sum += args.counters[i].counter;
    }
    // Print out the result. We also check that the threads actually
    // counted correctly (state)
    printf("mode=%s threads=%d sum=%" PRIu64 " state=%s aborts=%" PRIu64 " cputime=%fs per_increment=%fns\n",
           MODE, NTHREADS,
           sum, (sum % ROUNDS_PER_THREAD) == 0 ? "ok" : "fail",
           aborts,
           delta, // total cpu time that was spent
           delta * 1e9 / sum // nanoseconds per increment
    );
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment