Commit e7055f04 authored by Glauber Costa, committed by Pekka Enberg

memcpy: improve performance for x86's memcpy


In practice, the idea that rep movsb is the preferred way to implement memcpy
on x86 in the presence of the rep_good flag turns out to be false. This
implementation performs better in the misc-memcpy benchmark for virtually all
sizes.
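
For reference, the rep movsb baseline being compared against is the existing
repmovsb helper, visible as context in the diff below. A minimal sketch of
that approach (mirroring the helper's signature and constraints, not new code)
looks like this:

    static inline void
    repmovsb(void *__restrict &dest, const void *__restrict &src, size_t &n)
    {
        // Copy n bytes with a single "rep movsb"; RDI, RSI and RCX are
        // updated in place through the output constraints.
        asm volatile("rep movsb"
            : "+D"(dest), "+S"(src), "+c"(n) : : "memory");
    }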

I have also tested a simple byte-by-byte copy loop and Duff's device. For
Duff's device, I am seeing a weird bug when it is integrated with our memcpy,
but it is of course possible to implement it separately for sizes up to 256
bytes for analysis, which is what I did.
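
To make the comparison concrete, a byte-granularity Duff's device copy for
small sizes looks roughly like the sketch below. This is only a hypothetical
illustration of the technique, not the exact code that was benchmarked:

    static void duff_copy(unsigned char *to, const unsigned char *from, size_t n)
    {
        // Hypothetical illustration of Duff's device, not the measured code.
        if (n == 0) {
            return;
        }
        size_t rounds = (n + 7) / 8;   // 8 copies per loop iteration
        switch (n % 8) {               // jump into the unrolled loop for the remainder
        case 0: do { *to++ = *from++;
        case 7:      *to++ = *from++;
        case 6:      *to++ = *from++;
        case 5:      *to++ = *from++;
        case 4:      *to++ = *from++;
        case 3:      *to++ = *from++;
        case 2:      *to++ = *from++;
        case 1:      *to++ = *from++;
                } while (--rounds > 0);
        }
    }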

What the results below show is that all versions start out faster than
rep movsb for very small objects, but the loop becomes slower at sizes as low
as 32 bytes. Duff's device is slower for 64-byte elements, while this patch is
faster for all sizes measured. We can copy 64 bytes in 5.6 ns, 128 bytes in
7.7 ns, and 256 bytes in 13.3 ns, whereas the original numbers were 11 ns,
11 ns, and 13.8 ns.
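
As a rough idea of how per-size timings like the ones below can be collected,
here is a hypothetical timing-loop sketch; it is not the actual misc-memcpy
benchmark used for these numbers:

    #include <chrono>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main()
    {
        std::vector<char> src(32768), dst(32768);
        for (size_t size = 4; size <= 32768; size *= 2) {
            const int iters = 1000000;
            auto start = std::chrono::steady_clock::now();
            for (int i = 0; i < iters; i++) {
                std::memcpy(dst.data(), src.data(), size);
                asm volatile("" ::: "memory");   // keep the copy from being optimized out
            }
            auto end = std::chrono::steady_clock::now();
            // Average nanoseconds per copy for this size.
            double ns = std::chrono::duration<double, std::nano>(end - start).count() / iters;
            std::printf("%zu,%f\n", size, ns);
        }
        return 0;
    }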

Balloon Safety:

Balloon memcpys are 128 MB in size. Even partial copies are at least in the
kB range, so I am not expecting any funny interaction with this change, nor
anticipating the need to insert fixups here.

Full Results:

Original
========
4,11.066000,13.217000,11.313369,0.527048
8,29.427999,31.054001,29.797934,0.540056
16,11.065000,11.147000,11.088465,0.030663
32,11.065000,11.199000,11.093401,0.043994
64,11.065000,11.508000,11.115365,0.092626
128,12.866000,13.137000,12.914132,0.066646
256,13.896000,14.252000,13.937533,0.067841
512,15.955000,16.304001,16.006964,0.073594
1024,20.072001,20.301001,20.122099,0.052627
2048,28.306999,28.577999,28.377703,0.063443
4096,44.785999,45.087002,44.899033,0.068806
8192,77.783997,78.370003,77.918457,0.113472
16384,150.259003,183.679001,158.534668,5.947755
32768,1049.886963,1053.098022,1051.364380,0.851499

Loop
====
4,3.152000,3.734000,3.347033,0.185811
8,4.467000,5.336000,4.936766,0.221336
16,6.655000,8.262000,7.695767,0.377303
32,19.788000,20.438000,19.960333,0.221289
64,25.996000,29.969999,29.217133,0.828447
128,44.501999,45.562000,45.335640,0.244315
256,85.459000,95.369003,91.925179,3.409483
512,14.925000,15.014000,14.939700,0.024197
1024,19.042999,19.143000,19.060701,0.028286
2048,27.277000,27.386000,27.306065,0.035528
4096,43.750000,43.902000,43.789631,0.038810
8192,76.699997,76.872002,76.769691,0.040407
16384,149.393997,164.602005,157.051132,4.324330
32768,1045.287964,1047.580933,1046.380493,0.617742

Duff
====
4,3.602000,4.120000,3.722167,0.163732
8,4.631000,4.725000,4.643835,0.028509
16,7.205000,7.316000,7.213567,0.022538
32,11.838000,12.613000,12.032168,0.285366
64,21.681000,22.173000,21.754402,0.088584
128,41.331001,41.651001,41.452267,0.066087
256,80.431000,80.927002,80.737724,0.106475

This patch
==========
4,3.602000,3.895000,3.636133,0.071126
8,3.602000,3.679000,3.607600,0.015768
16,3.859000,3.981000,3.875433,0.032632
32,4.888000,4.994000,4.899767,0.025539
64,5.663000,6.404000,6.001000,0.158665
128,7.737000,8.168000,7.881701,0.156874
256,13.301000,17.438999,14.937235,0.880874
512,14.925000,15.226000,14.975132,0.072150
1024,19.042999,19.412001,19.099068,0.095145
2048,27.278000,32.022999,27.617165,1.007376
4096,43.750000,44.146000,43.844494,0.094062
8192,76.698997,83.873001,77.137794,1.266063
16384,153.483994,168.636002,160.516830,3.837175
32768,1047.878052,1068.301025,1052.600586,4.441750

Signed-off-by: Glauber Costa <glommer@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
parent e507e3b6
@@ -99,24 +99,62 @@ repmovsb(void *__restrict &dest, const void *__restrict &src, size_t &n)
         : "+D"(dest), "+S"(src), "+c"(n) : : "memory");
 }
 
+static inline void small_memcpy(void *dest, const void *src, size_t n)
+{
+    size_t qty = n / 8;
+    unsigned long *to_8 = (unsigned long *)dest;
+    unsigned long *from_8 = (unsigned long *)src;
+
+    while (qty--) {
+        *to_8++ = *from_8++;
+    }
+
+    qty = n % 8;
+    unsigned int *to_4 = (unsigned int *)to_8;
+    unsigned int *from_4 = (unsigned int *)from_8;
+
+    if (qty / 4) {
+        *to_4++ = *from_4++;
+    }
+
+    qty = qty % 4;
+    unsigned short *to_2 = (unsigned short *)to_4;
+    unsigned short *from_2 = (unsigned short *)from_4;
+
+    if (qty / 2) {
+        *to_2++ = *from_2++;
+    }
+
+    unsigned char *to = (unsigned char *)to_2;
+    unsigned char *from = (unsigned char *)from_2;
+
+    if (qty % 2) {
+        *to++ = *from++;
+    }
+}
+
 extern "C"
 void *memcpy_repmov_old(void *__restrict dest, const void *__restrict src, size_t n)
 {
     auto ret = dest;
-    auto nw = n / 8;
-    auto nb = n & 7;
-    repmovsq(dest, src, nw);
-    repmovsb(dest, src, nb);
+    if (n <= 256) {
+        small_memcpy(dest, src, n);
+    } else {
+        auto nw = n / 8;
+        auto nb = n & 7;
+        repmovsq(dest, src, nw);
+        repmovsb(dest, src, nb);
+    }
     return ret;
 }
 
 extern "C"
 void *memcpy_repmov(void *__restrict dest, const void *__restrict src, size_t n)
 {
     auto ret = dest;
-    repmovsb(dest, src, n);
+    if (n <= 256) {
+        small_memcpy(dest, src, n);
+    } else {
+        repmovsb(dest, src, n);
+    }
     return ret;
 }