diff --git a/23-rseq/Makefile b/23-rseq/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..10c97b3a9251fe6ee9f88d8567fcd7a5d2e916c1 --- /dev/null +++ b/23-rseq/Makefile @@ -0,0 +1,23 @@ +PROG = rseq + +${PROG}: ${PROG}.o rseq-asm.o Makefile + gcc ${PROG}.o rseq-asm.o -o $@ -Wall -g -Wno-unused-function -O3 -lpthread + +%.o: %.c + gcc -c $< -o $@ -Wall -g -Wno-unused-function -O3 + +%.o: %.S + gcc -c $< -o $@ + +run: ${PROG} + ./${PROG} 32 regular + ./${PROG} 32 lock + +clean: + rm -f ./${PROG} *.o + +man: + man ./rseq.2 + +plot: + ./plot.py data diff --git a/23-rseq/benchmark.sh b/23-rseq/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..937608a39201423e1e754108fb482415d7994be4 --- /dev/null +++ b/23-rseq/benchmark.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +rm data + +for mode in rseq-atomic rseq regular lock getcpu-atomic ; do + for threads in `seq 1 32`; do + ./rseq $threads $mode 8 >> data + done +done + +echo "Run: ./plot.py data" diff --git a/23-rseq/data.Ryzen7_PRO_5850U b/23-rseq/data.Ryzen7_PRO_5850U new file mode 100644 index 0000000000000000000000000000000000000000..d0a00babc3ebd7242f040bb49f4415602201fb99 --- /dev/null +++ b/23-rseq/data.Ryzen7_PRO_5850U @@ -0,0 +1,160 @@ +mode=rseq-atomic threads=1 sum=400000000 state=ok aborts=0 cputime=2.199005s per_increment=5.497513ns +mode=rseq-atomic threads=2 sum=800000000 state=ok aborts=0 cputime=3.284306s per_increment=4.105382ns +mode=rseq-atomic threads=3 sum=1200000000 state=ok aborts=0 cputime=4.448196s per_increment=3.706830ns +mode=rseq-atomic threads=4 sum=1600000000 state=ok aborts=0 cputime=8.446127s per_increment=5.278830ns +mode=rseq-atomic threads=5 sum=2000000000 state=ok aborts=0 cputime=9.053520s per_increment=4.526760ns +mode=rseq-atomic threads=6 sum=2400000000 state=ok aborts=0 cputime=12.195459s per_increment=5.081441ns +mode=rseq-atomic threads=7 sum=2800000000 state=ok aborts=0 cputime=11.748222s per_increment=4.195793ns +mode=rseq-atomic 
threads=8 sum=3200000000 state=ok aborts=0 cputime=15.032143s per_increment=4.697545ns +mode=rseq-atomic threads=9 sum=3600000000 state=ok aborts=0 cputime=16.116472s per_increment=4.476798ns +mode=rseq-atomic threads=10 sum=4000000000 state=ok aborts=0 cputime=20.672133s per_increment=5.168033ns +mode=rseq-atomic threads=11 sum=4400000000 state=ok aborts=0 cputime=23.798913s per_increment=5.408844ns +mode=rseq-atomic threads=12 sum=4800000000 state=ok aborts=0 cputime=26.946868s per_increment=5.613931ns +mode=rseq-atomic threads=13 sum=5200000000 state=ok aborts=0 cputime=29.666975s per_increment=5.705187ns +mode=rseq-atomic threads=14 sum=5600000000 state=ok aborts=0 cputime=33.483891s per_increment=5.979266ns +mode=rseq-atomic threads=15 sum=6000000000 state=ok aborts=0 cputime=37.403459s per_increment=6.233910ns +mode=rseq-atomic threads=16 sum=6400000000 state=ok aborts=0 cputime=40.505500s per_increment=6.328984ns +mode=rseq-atomic threads=17 sum=6800000000 state=ok aborts=0 cputime=42.792953s per_increment=6.293081ns +mode=rseq-atomic threads=18 sum=7200000000 state=ok aborts=0 cputime=45.171370s per_increment=6.273801ns +mode=rseq-atomic threads=19 sum=7600000000 state=ok aborts=0 cputime=47.448919s per_increment=6.243279ns +mode=rseq-atomic threads=20 sum=8000000000 state=ok aborts=0 cputime=49.511613s per_increment=6.188952ns +mode=rseq-atomic threads=21 sum=8400000000 state=ok aborts=0 cputime=52.517915s per_increment=6.252133ns +mode=rseq-atomic threads=22 sum=8800000000 state=ok aborts=0 cputime=55.319380s per_increment=6.286293ns +mode=rseq-atomic threads=23 sum=9200000000 state=ok aborts=0 cputime=57.941721s per_increment=6.298013ns +mode=rseq-atomic threads=24 sum=9600000000 state=ok aborts=0 cputime=60.357880s per_increment=6.287279ns +mode=rseq-atomic threads=25 sum=10000000000 state=ok aborts=0 cputime=62.908038s per_increment=6.290804ns +mode=rseq-atomic threads=26 sum=10400000000 state=ok aborts=0 cputime=65.316597s per_increment=6.280442ns 
+mode=rseq-atomic threads=27 sum=10800000000 state=ok aborts=0 cputime=68.210331s per_increment=6.315771ns +mode=rseq-atomic threads=28 sum=11200000000 state=ok aborts=0 cputime=71.145485s per_increment=6.352275ns +mode=rseq-atomic threads=29 sum=11600000000 state=ok aborts=0 cputime=74.014644s per_increment=6.380573ns +mode=rseq-atomic threads=30 sum=12000000000 state=ok aborts=0 cputime=80.516616s per_increment=6.709718ns +mode=rseq-atomic threads=31 sum=12400000000 state=ok aborts=0 cputime=78.501505s per_increment=6.330767ns +mode=rseq-atomic threads=32 sum=12800000000 state=ok aborts=0 cputime=81.190299s per_increment=6.342992ns +mode=rseq threads=1 sum=400000000 state=ok aborts=0 cputime=2.483927s per_increment=6.209818ns +mode=rseq threads=2 sum=800000000 state=ok aborts=1 cputime=2.927913s per_increment=3.659891ns +mode=rseq threads=3 sum=1200000000 state=ok aborts=0 cputime=4.631658s per_increment=3.859715ns +mode=rseq threads=4 sum=1600000000 state=ok aborts=2 cputime=9.087390s per_increment=5.679619ns +mode=rseq threads=5 sum=2000000000 state=ok aborts=1 cputime=7.195553s per_increment=3.597776ns +mode=rseq threads=6 sum=2400000000 state=ok aborts=0 cputime=9.132219s per_increment=3.805091ns +mode=rseq threads=7 sum=2800000000 state=ok aborts=0 cputime=10.669462s per_increment=3.810522ns +mode=rseq threads=8 sum=3200000000 state=ok aborts=2 cputime=14.527707s per_increment=4.539909ns +mode=rseq threads=9 sum=3600000000 state=ok aborts=0 cputime=15.470640s per_increment=4.297400ns +mode=rseq threads=10 sum=4000000000 state=ok aborts=3 cputime=18.031536s per_increment=4.507884ns +mode=rseq threads=11 sum=4400000000 state=ok aborts=1 cputime=20.358866s per_increment=4.627015ns +mode=rseq threads=12 sum=4800000000 state=ok aborts=4 cputime=23.474872s per_increment=4.890598ns +mode=rseq threads=13 sum=5200000000 state=ok aborts=5 cputime=29.528908s per_increment=5.678636ns +mode=rseq threads=14 sum=5600000000 state=ok aborts=8 cputime=30.623857s 
per_increment=5.468546ns +mode=rseq threads=15 sum=6000000000 state=ok aborts=11 cputime=33.960129s per_increment=5.660021ns +mode=rseq threads=16 sum=6400000000 state=ok aborts=14 cputime=36.712009s per_increment=5.736251ns +mode=rseq threads=17 sum=6800000000 state=ok aborts=18 cputime=38.662291s per_increment=5.685631ns +mode=rseq threads=18 sum=7200000000 state=ok aborts=22 cputime=40.966459s per_increment=5.689786ns +mode=rseq threads=19 sum=7600000000 state=ok aborts=24 cputime=42.702562s per_increment=5.618758ns +mode=rseq threads=20 sum=8000000000 state=ok aborts=17 cputime=45.202435s per_increment=5.650304ns +mode=rseq threads=21 sum=8400000000 state=ok aborts=38 cputime=48.130558s per_increment=5.729828ns +mode=rseq threads=22 sum=8800000000 state=ok aborts=48 cputime=50.154982s per_increment=5.699430ns +mode=rseq threads=23 sum=9200000000 state=ok aborts=61 cputime=53.364385s per_increment=5.800477ns +mode=rseq threads=24 sum=9600000000 state=ok aborts=57 cputime=54.841167s per_increment=5.712622ns +mode=rseq threads=25 sum=10000000000 state=ok aborts=21 cputime=56.911656s per_increment=5.691166ns +mode=rseq threads=26 sum=10400000000 state=ok aborts=39 cputime=59.314805s per_increment=5.703347ns +mode=rseq threads=27 sum=10800000000 state=ok aborts=58 cputime=61.900339s per_increment=5.731513ns +mode=rseq threads=28 sum=11200000000 state=ok aborts=42 cputime=63.666270s per_increment=5.684488ns +mode=rseq threads=29 sum=11600000000 state=ok aborts=56 cputime=66.452641s per_increment=5.728676ns +mode=rseq threads=30 sum=12000000000 state=ok aborts=87 cputime=68.713723s per_increment=5.726144ns +mode=rseq threads=31 sum=12400000000 state=ok aborts=48 cputime=70.797445s per_increment=5.709471ns +mode=rseq threads=32 sum=12800000000 state=ok aborts=69 cputime=73.319152s per_increment=5.728059ns +mode=regular threads=1 sum=400000000 state=ok aborts=0 cputime=4.445797s per_increment=11.114493ns +mode=regular threads=2 sum=800000000 state=ok aborts=0 
cputime=10.181069s per_increment=12.726336ns +mode=regular threads=3 sum=1200000000 state=ok aborts=0 cputime=15.207685s per_increment=12.673070ns +mode=regular threads=4 sum=1600000000 state=ok aborts=0 cputime=20.394730s per_increment=12.746707ns +mode=regular threads=5 sum=2000000000 state=ok aborts=0 cputime=25.606271s per_increment=12.803136ns +mode=regular threads=6 sum=2400000000 state=ok aborts=0 cputime=30.550001s per_increment=12.729167ns +mode=regular threads=7 sum=2800000000 state=ok aborts=0 cputime=31.396464s per_increment=11.213023ns +mode=regular threads=8 sum=3200000000 state=ok aborts=0 cputime=40.884241s per_increment=12.776325ns +mode=regular threads=9 sum=3600000000 state=ok aborts=0 cputime=48.727290s per_increment=13.535358ns +mode=regular threads=10 sum=4000000000 state=ok aborts=0 cputime=57.116025s per_increment=14.279006ns +mode=regular threads=11 sum=4400000000 state=ok aborts=0 cputime=65.007191s per_increment=14.774362ns +mode=regular threads=12 sum=4800000000 state=ok aborts=0 cputime=72.886369s per_increment=15.184660ns +mode=regular threads=13 sum=5200000000 state=ok aborts=0 cputime=82.631048s per_increment=15.890586ns +mode=regular threads=14 sum=5600000000 state=ok aborts=0 cputime=83.803839s per_increment=14.964971ns +mode=regular threads=15 sum=6000000000 state=ok aborts=0 cputime=101.826165s per_increment=16.971027ns +mode=regular threads=16 sum=6399999999 state=fail aborts=0 cputime=103.622771s per_increment=16.191058ns +mode=regular threads=17 sum=6799999980 state=fail aborts=0 cputime=117.516558s per_increment=17.281847ns +mode=regular threads=18 sum=7199999984 state=fail aborts=0 cputime=124.609096s per_increment=17.306819ns +mode=regular threads=19 sum=7599999959 state=fail aborts=0 cputime=131.441945s per_increment=17.294993ns +mode=regular threads=20 sum=7999999972 state=fail aborts=0 cputime=127.853660s per_increment=15.981708ns +mode=regular threads=21 sum=8399999947 state=fail aborts=0 cputime=145.638661s 
per_increment=17.337936ns +mode=regular threads=22 sum=8799999943 state=fail aborts=0 cputime=152.054029s per_increment=17.278867ns +mode=regular threads=23 sum=9199999933 state=fail aborts=0 cputime=159.989756s per_increment=17.390191ns +mode=regular threads=24 sum=9599999946 state=fail aborts=0 cputime=167.602853s per_increment=17.458631ns +mode=regular threads=25 sum=9999999946 state=fail aborts=0 cputime=173.688899s per_increment=17.368890ns +mode=regular threads=26 sum=10399999956 state=fail aborts=0 cputime=180.728688s per_increment=17.377759ns +mode=regular threads=27 sum=10799999959 state=fail aborts=0 cputime=187.942425s per_increment=17.402076ns +mode=regular threads=28 sum=11199999958 state=fail aborts=0 cputime=194.921763s per_increment=17.403729ns +mode=regular threads=29 sum=11599999957 state=fail aborts=0 cputime=201.910554s per_increment=17.406082ns +mode=regular threads=30 sum=11999999966 state=fail aborts=0 cputime=208.667108s per_increment=17.388926ns +mode=regular threads=31 sum=12399999964 state=fail aborts=0 cputime=218.638442s per_increment=17.632132ns +mode=regular threads=32 sum=12799999939 state=fail aborts=0 cputime=222.786872s per_increment=17.405224ns +mode=lock threads=1 sum=400000000 state=ok aborts=0 cputime=9.739219s per_increment=24.348048ns +mode=lock threads=2 sum=800000000 state=ok aborts=0 cputime=19.477206s per_increment=24.346508ns +mode=lock threads=3 sum=1200000000 state=ok aborts=0 cputime=29.639653s per_increment=24.699711ns +mode=lock threads=4 sum=1600000000 state=ok aborts=0 cputime=39.447133s per_increment=24.654458ns +mode=lock threads=5 sum=2000000000 state=ok aborts=0 cputime=49.416199s per_increment=24.708099ns +mode=lock threads=6 sum=2400000000 state=ok aborts=0 cputime=58.912073s per_increment=24.546697ns +mode=lock threads=7 sum=2800000000 state=ok aborts=0 cputime=69.373970s per_increment=24.776418ns +mode=lock threads=8 sum=3200000000 state=ok aborts=0 cputime=80.045758s per_increment=25.014299ns +mode=lock 
threads=9 sum=3600000000 state=ok aborts=0 cputime=95.506874s per_increment=26.529687ns +mode=lock threads=10 sum=4000000000 state=ok aborts=0 cputime=105.647129s per_increment=26.411782ns +mode=lock threads=11 sum=4400000000 state=ok aborts=0 cputime=119.966161s per_increment=27.265037ns +mode=lock threads=12 sum=4800000000 state=ok aborts=0 cputime=144.790100s per_increment=30.164604ns +mode=lock threads=13 sum=5200000000 state=ok aborts=0 cputime=169.275312s per_increment=32.552945ns +mode=lock threads=14 sum=5600000000 state=ok aborts=0 cputime=184.008284s per_increment=32.858622ns +mode=lock threads=15 sum=6000000000 state=ok aborts=0 cputime=206.639565s per_increment=34.439927ns +mode=lock threads=16 sum=6400000000 state=ok aborts=0 cputime=220.186430s per_increment=34.404130ns +mode=lock threads=17 sum=6800000000 state=ok aborts=0 cputime=243.215361s per_increment=35.766965ns +mode=lock threads=18 sum=7200000000 state=ok aborts=0 cputime=256.791891s per_increment=35.665540ns +mode=lock threads=19 sum=7600000000 state=ok aborts=0 cputime=270.841824s per_increment=35.637082ns +mode=lock threads=20 sum=8000000000 state=ok aborts=0 cputime=286.147054s per_increment=35.768382ns +mode=lock threads=21 sum=8400000000 state=ok aborts=0 cputime=303.063539s per_increment=36.078993ns +mode=lock threads=22 sum=8800000000 state=ok aborts=0 cputime=334.728624s per_increment=38.037344ns +mode=lock threads=23 sum=9200000000 state=ok aborts=0 cputime=365.422783s per_increment=39.719868ns +mode=lock threads=24 sum=9600000000 state=ok aborts=0 cputime=383.564521s per_increment=39.954638ns +mode=lock threads=25 sum=10000000000 state=ok aborts=0 cputime=400.088520s per_increment=40.008852ns +mode=lock threads=26 sum=10400000000 state=ok aborts=0 cputime=413.845634s per_increment=39.792849ns +mode=lock threads=27 sum=10800000000 state=ok aborts=0 cputime=432.694245s per_increment=40.064282ns +mode=lock threads=28 sum=11200000000 state=ok aborts=0 cputime=448.437452s 
per_increment=40.039058ns +mode=lock threads=29 sum=11600000000 state=ok aborts=0 cputime=479.398819s per_increment=41.327484ns +mode=lock threads=30 sum=12000000000 state=ok aborts=0 cputime=485.327195s per_increment=40.443933ns +mode=lock threads=31 sum=12400000000 state=ok aborts=0 cputime=498.999020s per_increment=40.241856ns +mode=lock threads=32 sum=12800000000 state=ok aborts=0 cputime=518.804851s per_increment=40.531629ns +mode=getcpu-atomic threads=1 sum=400000000 state=ok aborts=0 cputime=4.590758s per_increment=11.476895ns +mode=getcpu-atomic threads=2 sum=800000000 state=ok aborts=0 cputime=10.373209s per_increment=12.966511ns +mode=getcpu-atomic threads=3 sum=1200000000 state=ok aborts=0 cputime=15.454717s per_increment=12.878931ns +mode=getcpu-atomic threads=4 sum=1600000000 state=ok aborts=0 cputime=18.276674s per_increment=11.422921ns +mode=getcpu-atomic threads=5 sum=2000000000 state=ok aborts=0 cputime=25.616891s per_increment=12.808445ns +mode=getcpu-atomic threads=6 sum=2400000000 state=ok aborts=0 cputime=30.558029s per_increment=12.732512ns +mode=getcpu-atomic threads=7 sum=2800000000 state=ok aborts=0 cputime=35.695183s per_increment=12.748280ns +mode=getcpu-atomic threads=8 sum=3200000000 state=ok aborts=0 cputime=41.202231s per_increment=12.875697ns +mode=getcpu-atomic threads=9 sum=3600000000 state=ok aborts=0 cputime=48.857853s per_increment=13.571626ns +mode=getcpu-atomic threads=10 sum=4000000000 state=ok aborts=0 cputime=57.148052s per_increment=14.287013ns +mode=getcpu-atomic threads=11 sum=4400000000 state=ok aborts=0 cputime=65.426838s per_increment=14.869736ns +mode=getcpu-atomic threads=12 sum=4800000000 state=ok aborts=0 cputime=74.198416s per_increment=15.458003ns +mode=getcpu-atomic threads=13 sum=5200000000 state=ok aborts=0 cputime=88.584374s per_increment=17.035457ns +mode=getcpu-atomic threads=14 sum=5600000000 state=ok aborts=0 cputime=97.566401s per_increment=17.422572ns +mode=getcpu-atomic threads=15 sum=6000000000 
state=ok aborts=0 cputime=109.718405s per_increment=18.286401ns +mode=getcpu-atomic threads=16 sum=6400000000 state=ok aborts=0 cputime=122.916752s per_increment=19.205742ns +mode=getcpu-atomic threads=17 sum=6800000000 state=ok aborts=0 cputime=134.455988s per_increment=19.772939ns +mode=getcpu-atomic threads=18 sum=7200000000 state=ok aborts=0 cputime=140.480629s per_increment=19.511198ns +mode=getcpu-atomic threads=19 sum=7600000000 state=ok aborts=0 cputime=146.157256s per_increment=19.231218ns +mode=getcpu-atomic threads=20 sum=8000000000 state=ok aborts=0 cputime=155.691843s per_increment=19.461480ns +mode=getcpu-atomic threads=21 sum=8400000000 state=ok aborts=0 cputime=163.030484s per_increment=19.408391ns +mode=getcpu-atomic threads=22 sum=8800000000 state=ok aborts=0 cputime=170.584687s per_increment=19.384623ns +mode=getcpu-atomic threads=23 sum=9200000000 state=ok aborts=0 cputime=176.381227s per_increment=19.171873ns +mode=getcpu-atomic threads=24 sum=9600000000 state=ok aborts=0 cputime=185.423582s per_increment=19.314956ns +mode=getcpu-atomic threads=25 sum=10000000000 state=ok aborts=0 cputime=192.877967s per_increment=19.287797ns +mode=getcpu-atomic threads=26 sum=10400000000 state=ok aborts=0 cputime=199.917576s per_increment=19.222844ns +mode=getcpu-atomic threads=27 sum=10800000000 state=ok aborts=0 cputime=207.822609s per_increment=19.242834ns +mode=getcpu-atomic threads=28 sum=11200000000 state=ok aborts=0 cputime=216.261564s per_increment=19.309068ns +mode=getcpu-atomic threads=29 sum=11600000000 state=ok aborts=0 cputime=213.773868s per_increment=18.428782ns +mode=getcpu-atomic threads=30 sum=12000000000 state=ok aborts=0 cputime=221.522895s per_increment=18.460241ns +mode=getcpu-atomic threads=31 sum=12400000000 state=ok aborts=0 cputime=240.809263s per_increment=19.420102ns +mode=getcpu-atomic threads=32 sum=12800000000 state=ok aborts=0 cputime=251.500147s per_increment=19.648449ns diff --git a/23-rseq/plot.Ryzen7_PRO_5850U.png 
b/23-rseq/plot.Ryzen7_PRO_5850U.png new file mode 100644 index 0000000000000000000000000000000000000000..3314476504cef82c5c861ef8a39bd5f0d2c3ed5a Binary files /dev/null and b/23-rseq/plot.Ryzen7_PRO_5850U.png differ diff --git a/23-rseq/plot.py b/23-rseq/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..c1681ecdd201846760fd9b4a98de15100f59fbef --- /dev/null +++ b/23-rseq/plot.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +def read_log(fn): + with open(fn) as fd: + lines = fd.readlines() + rows = [] + for line in lines: + line = line.strip().split(" ") + header = [x.split("=")[0] for x in line] + data = [x.split("=")[1].rstrip("ns") for x in line] + rows.append(data) + df = pd.DataFrame( + columns=header, + data=rows) + for x in "threads sum aborts cputime per_increment".split(): + df[x] = df[x].apply(float) + return df + +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + sys.exit("usage: %s [LOGFILE]" % sys.argv[0]) + df = read_log(sys.argv[1]) + + per_inc = df.set_index(["mode", "threads"]).per_increment.unstack().T + + ax = per_inc.plot(marker='x', grid=True,figsize=(10,10)) + ax.set_ylim((0, None)) + ax.set_ylabel("Per Increment [ns]") + ax.get_figure().savefig('plot.png') + diff --git a/23-rseq/rseq-asm.S b/23-rseq/rseq-asm.S new file mode 100644 index 0000000000000000000000000000000000000000..6a15529e4dfe8ab71e1baf8a2263ff71020c7ab8 --- /dev/null +++ b/23-rseq/rseq-asm.S @@ -0,0 +1,58 @@ +// int operation_rseq(struct rseq * rseq, struct cacheline *counters) { +// According to the Calling Convention the arguments come in registers: +// rseq: %rdi +// counter: %rsi +.p2align 4 +.globl operation_rseq +.type operation_rseq, @function +operation_rseq: +.cfi_startproc + // We will return the number of aborts in %eax. 
Initialize + // eax with zero + xor %eax, %eax // %eax = 0 + + // We inform the kernel that we are now within a restartable + // sequence by moving a pointer to operation_rseq_cs (see below) + // to the kernel-registered rseq object. + // After an abort, we also jump to this label (restart_ip) +.restart_ip: + // FIXME: Update rseq->rseq_cs + + // The restartable sequence + // Implements: [rseq->cpu_id].counter ++; +.start_ip: // Start of restartable sequence + // HINT: Structure of rseq is documented in /usr/include/linux/rseq.h + // HINT: rseq->cpu_id == 4(%rdi) + // HINT: Each counter-cache-line is 64 bytes long +.end_ip: // End of restartable sequence + ret + + // The abort trampoline + + // Before the abort label, the kernel will check if a specific + // signature is present. We hide this signature in a + // well-crafted assembler instruction. + // ud1 <sig>(%rip),%edi + .byte 0x0f, 0xb9, 0x3d + .long 0x53053053 // RSEQ_SIG +.abort_ip: // On abort, the kernel will jump here + // FIXME: count aborts in %eax + jmp .restart_ip + +// } End of operation_rseq() +.cfi_endproc +.size operation_rseq, .-operation_rseq + +// struct rseq_cs operation_rseq_cs -- descriptor for our rseq +.section .data.rel.local,"aw" +.align 32 +.type operation_rseq_cs, @object +.size operation_rseq_cs, 32 +operation_rseq_cs: + .long 0 // __u32 version + .long 0 // __u32 flags + .quad .start_ip // __u64 start_ip + .quad .end_ip - .start_ip // __u64 post_commit_offset + .quad .abort_ip // __u64 abort_ip + +.section .note.GNU-stack,"",@progbits diff --git a/23-rseq/rseq.2 b/23-rseq/rseq.2 new file mode 100644 index 0000000000000000000000000000000000000000..8882e1c701a286ca5591e1d1e2324b3128f78b74 --- /dev/null +++ b/23-rseq/rseq.2 @@ -0,0 +1,371 @@ +.\" Copyright 2015-2020 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> +.\" +.\" %%%LICENSE_START(VERBATIM) +.\" Permission is granted to make and distribute verbatim copies of this +.\" manual provided the copyright notice and this permission 
notice are +.\" preserved on all copies. +.\" +.\" Permission is granted to copy and distribute modified versions of this +.\" manual under the conditions for verbatim copying, provided that the +.\" entire resulting derived work is distributed under the terms of a +.\" permission notice identical to this one. +.\" +.\" Since the Linux kernel and libraries are constantly changing, this +.\" manual page may be incorrect or out-of-date. The author(s) assume no +.\" responsibility for errors or omissions, or for damages resulting from +.\" the use of the information contained herein. The author(s) may not +.\" have taken the same level of care in the production of this manual, +.\" which is licensed free of charge, as they might when working +.\" professionally. +.\" +.\" Formatted or processed versions of this manual, if unaccompanied by +.\" the source, must acknowledge the copyright and authors of this work. +.\" %%%LICENSE_END +.\" +.TH RSEQ 2 2020-06-05 "Linux" "Linux Programmer's Manual" +.SH NAME +rseq \- Restartable sequences and cpu number cache +.SH SYNOPSIS +.nf +.B #include <linux/rseq.h> +.sp +.BI "int rseq(struct rseq * " rseq ", uint32_t " rseq_len ", int " flags ", uint32_t " sig "); +.sp +.SH DESCRIPTION + +A restartable sequence is a sequence of instructions guaranteed to be executed +atomically with respect to other threads and signal handlers on the current +CPU. If its execution does not complete atomically, the kernel changes the +execution flow by jumping to an abort handler defined by user-space for that +restartable sequence. + +Using restartable sequences requires to register a +.BR __rseq_abi +thread-local storage data structure (struct rseq) through the +.BR rseq () +system call. Only one +.BR __rseq_abi +can be registered per thread, so user-space libraries and applications must +follow a user-space ABI defining how to share this resource. 
The ABI defining +how to share this resource between applications and libraries is defined by the +C library. + +The +.BR __rseq_abi +contains a +.I rseq_cs +field which points to the currently executing critical section. For each +thread, a single rseq critical section can run at any given point. Each +critical section need to be implemented in assembly. + +The +.BR rseq () +ABI accelerates user-space operations on per-cpu data by defining a +shared data structure ABI between each user-space thread and the kernel. + +It allows user-space to perform update operations on per-cpu data +without requiring heavy-weight atomic operations. + +The term CPU used in this documentation refers to a hardware execution +context. For instance, each CPU number returned by +.BR sched_getcpu () +is a CPU. The current CPU means to the CPU on which the registered thread is +running. + +Restartable sequences are atomic with respect to preemption (making it +atomic with respect to other threads running on the same CPU), as well +as signal delivery (user-space execution contexts nested over the same +thread). They either complete atomically with respect to preemption on +the current CPU and signal delivery, or they are aborted. + +Restartable sequences are suited for update operations on per-cpu data. + +Restartable sequences can be used on data structures shared between threads +within a process, and on data structures shared between threads across +different processes. + +.PP +Some examples of operations that can be accelerated or improved +by this ABI: +.IP \[bu] 2 +Memory allocator per-cpu free-lists, +.IP \[bu] 2 +Querying the current CPU number, +.IP \[bu] 2 +Incrementing per-CPU counters, +.IP \[bu] 2 +Modifying data protected by per-CPU spinlocks, +.IP \[bu] 2 +Inserting/removing elements in per-CPU linked-lists, +.IP \[bu] 2 +Writing/reading per-CPU ring buffers content. +.IP \[bu] 2 +Accurately reading performance monitoring unit counters +with respect to thread migration. 
+ +.PP +Restartable sequences must not perform system calls. Doing so may result +in termination of the process by a segmentation fault. + +.PP +The +.I rseq +argument is a pointer to the thread-local rseq structure to be shared +between kernel and user-space. + +.PP +The layout of +.B struct rseq +is as follows: +.TP +.B Structure alignment +This structure is aligned on 32-byte boundary. +.TP +.B Structure size +This structure is fixed-size (32 bytes). Its size is passed as parameter to the +rseq system call. +.PP +.in +8n +.EX +struct rseq { + __u32 cpu_id_start; + __u32 cpu_id; + union { + /* Edited out for conciseness. [...] */ + } rseq_cs; + __u32 flags; +} __attribute__((aligned(32))); +.EE +.TP +.B Fields + +.TP +.in +4n +.I cpu_id_start +Optimistic cache of the CPU number on which the registered thread is +running. Its value is guaranteed to always be a possible CPU number, +even when rseq is not registered. Its value should always be confirmed by +reading the cpu_id field before user-space performs any side-effect (e.g. +storing to memory). + +This field is an optimistic cache in the sense that it is always +guaranteed to hold a valid CPU number in the range [ 0 .. +nr_possible_cpus - 1 ]. It can therefore be loaded by user-space and +used as an offset in per-cpu data structures without having to +check whether its value is within the valid bounds compared to the +number of possible CPUs in the system. + +Initialized by user-space to a possible CPU number (e.g., 0), updated +by the kernel for threads registered with rseq. + +For user-space applications executed on a kernel without rseq support, +the cpu_id_start field stays initialized at 0, which is indeed a valid +CPU number. It is therefore valid to use it as an offset in per-cpu data +structures, and only validate whether it's actually the current CPU +number by comparing it with the cpu_id field within the rseq critical +section. 
If the kernel does not provide rseq support, that cpu_id field +stays initialized at -1, so the comparison always fails, as intended. + +It is up to user-space to implement a fall-back mechanism for scenarios where +rseq is not available. +.in +.TP +.in +4n +.I cpu_id +Cache of the CPU number on which the registered thread is running. Initialized +by user-space to -1, updated by the kernel for threads registered with rseq. +.in +.TP +.in +4n +.I rseq_cs +The rseq_cs field is a pointer to a struct rseq_cs. It is NULL when no +rseq assembly block critical section is active for the registered thread. +Setting it to point to a critical section descriptor (struct rseq_cs) +marks the beginning of the critical section. + +Initialized by user-space to NULL. + +Updated by user-space, which sets the address of the currently +active rseq_cs at the beginning of assembly instruction sequence +block, and set to NULL by the kernel when it restarts an assembly +instruction sequence block, as well as when the kernel detects that +it is preempting or delivering a signal outside of the range +targeted by the rseq_cs. Also needs to be set to NULL by user-space +before reclaiming memory that contains the targeted struct rseq_cs. + +Read and set by the kernel. +.in +.TP +.in +4n +.I flags +Flags indicating the restart behavior for the registered thread. This is +mainly used for debugging purposes. Can be a combination of: +.IP \[bu] +RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT: Inhibit instruction sequence block restart +on preemption for this thread. +.IP \[bu] +RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL: Inhibit instruction sequence block restart +on signal delivery for this thread. +.IP \[bu] +RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE: Inhibit instruction sequence block restart +on migration for this thread. +.in + +Initialized by user-space, used by the kernel. + +.PP +The layout of +.B struct rseq_cs +version 0 is as follows: +.TP +.B Structure alignment +This structure is aligned on 32-byte boundary. 
+.TP +.B Structure size +This structure has a fixed size of 32 bytes. +.PP +.in +8n +.EX +struct rseq_cs { + __u32 version; + __u32 flags; + __u64 start_ip; + __u64 post_commit_offset; + __u64 abort_ip; +} __attribute__((aligned(32))); +.EE +.TP +.B Fields + +.TP +.in +4n +.I version +Version of this structure. Should be initialized to 0. +.in +.TP +.in +4n +.I flags +Flags indicating the restart behavior of this structure. Can be a combination +of: +.IP \[bu] +RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT: Inhibit instruction sequence block restart +on preemption for this critical section. +.IP \[bu] +RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL: Inhibit instruction sequence block restart +on signal delivery for this critical section. +.IP \[bu] +RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE: Inhibit instruction sequence block restart +on migration for this critical section. +.TP +.in +4n +.I start_ip +Instruction pointer address of the first instruction of the sequence of +consecutive assembly instructions. +.in +.TP +.in +4n +.I post_commit_offset +Offset (from start_ip address) of the address after the last instruction +of the sequence of consecutive assembly instructions. +.in +.TP +.in +4n +.I abort_ip +Instruction pointer address where to move the execution flow in case of +abort of the sequence of consecutive assembly instructions. +.in + +.PP +The +.I rseq_len +argument is the size of the +.I struct rseq +to register. + +.PP +The +.I flags +argument is 0 for registration, and +.IR RSEQ_FLAG_UNREGISTER +for unregistration. + +.PP +The +.I sig +argument is the 32-bit signature to be expected before the abort +handler code. + +.PP +A single library per process should keep the rseq structure in a +thread-local storage variable. +The +.I cpu_id +field should be initialized to -1, and the +.I cpu_id_start +field should be initialized to a possible CPU value (typically 0). + +.PP +Each thread is responsible for registering and unregistering its rseq +structure. 
No more than one rseq structure address can be registered +per thread at a given time. + +.PP +Reclaim of rseq object's memory must only be done after either an +explicit rseq unregistration is performed or after the thread exits. + +.PP +In a typical usage scenario, the thread registering the rseq +structure will be performing loads and stores from/to that structure. It +is however also allowed to read that structure from other threads. +The rseq field updates performed by the kernel provide relaxed atomicity +semantics (atomic store, without memory ordering), which guarantee that other +threads performing relaxed atomic reads (atomic load, without memory ordering) +of the cpu number cache will always observe a consistent value. + +.SH RETURN VALUE +A return value of 0 indicates success. On error, \-1 is returned, and +.I errno +is set appropriately. + +.SH ERRORS +.TP +.B EINVAL +Either +.I flags +contains an invalid value, or +.I rseq +contains an address which is not appropriately aligned, or +.I rseq_len +contains an incorrect size. +.TP +.B ENOSYS +The +.BR rseq () +system call is not implemented by this kernel. +.TP +.B EFAULT +.I rseq +is an invalid address. +.TP +.B EBUSY +Restartable sequence is already registered for this thread. +.TP +.B EPERM +The +.I sig +argument on unregistration does not match the signature received +on registration. + +.SH VERSIONS +The +.BR rseq () +system call was added in Linux 4.18. + +.SH CONFORMING TO +.BR rseq () +is Linux-specific. 
+ +.in +.SH SEE ALSO +.BR sched_getcpu (3) , +.BR membarrier (2) \ No newline at end of file diff --git a/23-rseq/rseq.c b/23-rseq/rseq.c new file mode 100644 index 0000000000000000000000000000000000000000..18ab5607de04b39daf92885f0f187a2e76c04e33 --- /dev/null +++ b/23-rseq/rseq.c @@ -0,0 +1,243 @@ +#define _GNU_SOURCE +#include <pthread.h> +#include <unistd.h> +#include <inttypes.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <sys/syscall.h> +#include <stdatomic.h> +#include <string.h> +#include <sys/sysinfo.h> +#include <stdbool.h> +#include <malloc.h> + +#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while(0) + +// With modern glibc versions (>2.35), the glibc already registers +// a rseq area for each thread that is started with +// pthread_create. In that case, we derive this pointer from the +// thread_pointer(). For details, see +// https://www.gnu.org/software/libc/manual/html_node/Restartable-Sequences.html +#if __has_include ("sys/rseq.h") +#include <sys/rseq.h> + +#if RSEQ_SIG != 0x53053053 +#error "glibc defined RSEQ_SIG differently" +#endif + +static struct rseq * rseq_register() { + return __builtin_thread_pointer() + __rseq_offset; +} + +#else +#include <linux/rseq.h> +#define RSEQ_SIG 0x53053053 + +// The rseq(2) syscall has no glibc wrapper. Therefore, we define our +// own. Please run `make man` to see the man page rseq(2). +int sys_rseq(struct rseq * rseq, uint32_t rseq_len, int flags, uint32_t sig) { + return syscall(SYS_rseq, rseq, rseq_len, flags, RSEQ_SIG); +} + +struct rseq *rseq_register() { + struct rseq *ret = memalign(sizeof(struct rseq), sizeof(struct rseq)); + memset(ret, 0, sizeof(struct rseq)); + ret->cpu_id_start = -1; + ret->cpu_id = -1; + if (sys_rseq(ret, sizeof(struct rseq), 0, 0) < 0) + die("rseq"); + return ret; +} + +#endif + + + +// This data structure is exactly one cache-line wide (assuming that a +// cache line is 64 bytes). 
// Thereby, we can allocate an array of
// cpu-local counters, where each CPU only operates on a single cache
// line. Thereby, we can avoid most side effects of cache-line
// transfers.
struct cacheline {
    union {
        char data[64];              // pad the union to one full cache line
        struct {
            uint64_t counter;       // the per-CPU increment counter
            pthread_mutex_t mutex;  // used only in the lock variant
        };
    };
};

// We will define multiple operation_t functions that all implement
// the same behavior: They increment a cpu-local counter by 1. Each
// returns the number of rseq aborts it observed (always 0 for the
// non-rseq variants).
typedef int (*operation_t)(struct rseq *_, struct cacheline *);

// The simplest variant of a CPU-local counter is to get the cpuid
// with getcpu() and increment the counter. However, due to the
// read-update-write cycle, this variant is racy and will produce
// incorrect results: the thread can be migrated between getcpu() and
// the store, or two threads on the same CPU's counter can interleave.
int operation_regular(struct rseq *_, struct cacheline *counters) {
    unsigned int cpu_id;

    getcpu(&cpu_id, NULL);
    counters[cpu_id].counter += 1;

    return 0;
}

// A correct, but slow variant uses the cache-line--local pthread
// mutex to lock the counter for the time of the operation.
int operation_lock(struct rseq *_, struct cacheline *counters) {
    unsigned int cpu_id;

    getcpu(&cpu_id, NULL);

    pthread_mutex_lock(&counters[cpu_id].mutex);
    counters[cpu_id].counter += 1;
    pthread_mutex_unlock(&counters[cpu_id].mutex);

    return 0;
}

// Variant that uses getcpu() + atomic_fetch_add. Even if the thread
// migrates between getcpu() and the increment, the atomic add makes
// the update itself race-free, so the grand total is always correct;
// only the CPU locality of that one increment is lost.
int operation_atomic(struct rseq *_, struct cacheline *counters) {
    unsigned int cpu_id;

    getcpu(&cpu_id, NULL);
    // Relaxed ordering suffices: the increment only has to be atomic,
    // it does not need to order other memory operations.
    __atomic_fetch_add(&counters[cpu_id].counter, 1, __ATOMIC_RELAXED);

    return 0;
}

// Variant without getcpu: Like operation_atomic, but uses the
// restartable sequence to retrieve the cpu id.
+// Please look at /usr/include/linux/rseq.h for the documentation of struct rseq +int operation_rseq_atomic(struct rseq* rs, struct cacheline *counters) { + // FIXME: Implement variant + + return 0; +} + + +// Variant that uses no atomic operations and fully relies on rseq +// This variant is implemented in assembler (see rseq.S) +extern int operation_rseq(struct rseq *, struct cacheline*); +// FIXME: Implement counter_rseq in rseq.S + + +//////////////////////////////////////////////////////////////// +// The Benchmarking code +// +// We start NTHREADS threads and each thread executes +// ROUNDS_PER_THREAD cpu-local increments + +int ROUNDS_PER_THREAD = 50000000; + +struct thread_args { + operation_t operation; + struct cacheline *counters; +}; + + +void* thread_handler(void* data) { + struct thread_args *args = data; + + // Register rseq area or use glibc's rseq + struct rseq *rseq = rseq_register(); + printf("rseq: %p\n", rseq); + + // Execute the given operation ROUNDS_PER_THREAD times and count + // the number of aborts (only != 0 for rseq) + uint64_t aborts = 0; + for (uint64_t i = 0; i < ROUNDS_PER_THREAD; i++) { + aborts += args->operation(rseq, args->counters); + } + + // Return the number of rseq aborts + return (void*) aborts; +} + +// Print usage and exit. +static void usage(char *argv0) { + fprintf(stderr, "usage: %s <threads> <regular|lock|getcpu-atomic|rseq-atomic|rseq> [rounds]\n", argv0); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + // Parameter Parsing. 
This is boring + if (argc < 3) usage(argv[0]); + if (argc == 4) + ROUNDS_PER_THREAD *= atoi(argv[3]); + + int CPUS = get_nprocs(); + int NTHREADS = atoi(argv[1]); + char *MODE = argv[2]; + + struct thread_args args; + if (!strcmp(MODE, "rseq")) args.operation = operation_rseq; + else if (!strcmp(MODE, "getcpu-atomic")) args.operation = operation_atomic; + else if (!strcmp(MODE, "rseq-atomic")) args.operation = operation_rseq_atomic; + else if (!strcmp(MODE, "regular")) args.operation = operation_regular; + else if (!strcmp(MODE, "lock")) args.operation = operation_lock; + else usage(argv[0]); + + // Initialize the CPU-local counters. Each CPU gets an struct + // cache-line on its own. We use aligned_alloc(3) to get + // cache-line-aligned memory from the allocator. + + args.counters = aligned_alloc(sizeof(struct cacheline), CPUS * sizeof(struct cacheline)); + if (!args.counters) die("calloc"); + + + // Initialize locks for the lock variant + for (uint32_t i = 0; i < CPUS; i++) { + pthread_mutex_init(&args.counters[i].mutex, NULL); + } + + + // The actual benchmarking code + //////////////////////////////////////////////////////////////// + struct timespec start, end; + // Start Time. We use the CLOCK_PROCESS_CPUTIME_ID to get the + // number of CPU-seconds spent. + if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start) < 0) + die("clock_gettime"); + + // Create NTHREADS threads + pthread_t threads[NTHREADS]; + for (uint32_t i = 0; i < NTHREADS; i++) { + pthread_create(&threads[i], NULL, thread_handler, (void*)&args); + } + + // Wait for all threads to complete and accumulate the number of aborts + uint64_t aborts = 0; + for (uint32_t i = 0; i < NTHREADS; i++) { + uint64_t thread_aborts; + pthread_join(threads[i], (void**)&thread_aborts); + aborts += thread_aborts; + } + + // End Time + if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end) < 0) + die("clock_gettime"); + + // Calculate the time delta between both points in time. 
+ double delta = end.tv_sec - start.tv_sec; + delta += (end.tv_nsec - start.tv_nsec) / 1e9; + + // Print out the cpu-local counters. With this output and a low + // number of threads you can see the thread migration. + uint64_t sum = 0; + for (uint32_t i = 0; i < CPUS; i++) { + fprintf(stderr, "counter[cpu=%d] = %ld\n", i, args.counters[i].counter); + sum += args.counters[i].counter; + } + + // Print out the result. We also check that the threads actually + // counted correctly (state) + printf("mode=%s threads=%d sum=%ld state=%s aborts=%ld cputime=%fs per_increment=%fns\n", + MODE, NTHREADS, + sum, (sum % ROUNDS_PER_THREAD) == 0 ? "ok" : "fail", + aborts, + delta, // total cpu time that was spent + delta * 1e9 / sum // nanoseconds per increment + ); +}