Tuesday, December 10, 2024

Why C++'s string_view is slower.

Why C++'s string_view is slower.


C++17's string_view is meant to replace parameters where you used to have "const std::string &str".


However, it's actually slower than "const std::string &str" because it requires a creation of an object on the stack (in assembler), which is the string_view object.


Even when it's optimised with -O3 (gcc flag), it's still slower.



test1.cpp:


```


#include <string>


int func(const std::string &str)

{

for (int i = 0; i < str.size(); i++) {

if (str[i] == ' ')

return i;

}


return 0;

}


int main()

{

std::string blah = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";


int count = 0;


for (int i = 0; i < 10000000; i++) {

count += func(blah);

}


printf("%d\n", count);

}



```



test2.cpp:


```


cat test2.cpp

#include <string>


int func(std::string_view str)

{

for (int i = 0; i < str.size(); i++) {

if (str[i] == ' ')

return i;

}


return 0;

}


int main()

{

std::string blah = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";


int count = 0;



for (int i = 0; i < 10000000; i++) {

count += func(blah);

}


printf("%d\n", count);


}


```



func() is a dummy, constant-time, operation, so that it doesn't get optimised out by the compiler as it returns a value that's needed by the printf("%d\n", count); call later on.



test1.cpp's time taken (the first run is non-optimised):


```


$ g++ -o test test.cpp

$ time ./test

0


real 0m1.080s

user 0m1.075s

sys 0m0.005s



$ g++ -o test test.cpp -O3

$ time ./test

0


real 0m0.221s

user 0m0.220s

sys 0m0.001s



```



test2.cpp's time taken (the first run is non-optimised):


```


$ g++ -o test2 test2.cpp

$ time ./test2

0


real 0m1.140s

user 0m1.135s

sys 0m0.005s

$ g++ -o test2 test2.cpp -O3

$ time ./test2

0


real 0m0.227s

user 0m0.223s

sys 0m0.004s


```



When comparing the two assembly outputs (non-optimised):


test1.cpp:


```


0x00000000000014d2 <+94>: lea -0x40(%rbp),%rax

0x00000000000014d6 <+98>: mov %rax,%rdi

0x00000000000014d9 <+101>: call 0x1409 <_Z4funcRKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE>

0x00000000000014de <+106>: add %eax,-0x48(%rbp)

0x00000000000014e1 <+109>: addl $0x1,-0x44(%rbp)

0x00000000000014e5 <+113>: cmpl $0x98967f,-0x44(%rbp)

0x00000000000014ec <+120>: jle 0x14d2 <main()+94>

0x00000000000014ee <+122>: mov -0x48(%rbp),%eax



```


test2.cpp:


```


0x00000000000014af <+78>: movl $0x0,-0x48(%rbp)

0x00000000000014b6 <+85>: movl $0x0,-0x44(%rbp)

0x00000000000014bd <+92>: jmp 0x14e6 <main()+133>

0x00000000000014bf <+94>: lea -0x40(%rbp),%rax

0x00000000000014c3 <+98>: mov %rax,%rdi

0x00000000000014c6 <+101>: call 0x1250 <_ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEcvSt17basic_string_viewIcS2_EEv@plt>

0x00000000000014cb <+106>: mov %rax,%rcx

0x00000000000014ce <+109>: mov %rdx,%rbx

0x00000000000014d1 <+112>: mov %rdx,%rax

0x00000000000014d4 <+115>: mov %rcx,%rdi

0x00000000000014d7 <+118>: mov %rax,%rsi

0x00000000000014da <+121>: call 0x13e9 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE>

0x00000000000014df <+126>: add %eax,-0x48(%rbp)

0x00000000000014e2 <+129>: addl $0x1,-0x44(%rbp)

0x00000000000014e6 <+133>: cmpl $0x98967f,-0x44(%rbp)



```


In test2.cpp it calls the string_view constructor, and then calls func(std::string_view); whereas in test1.cpp it just calls func(const std::string &str).


In the optimised versions though, it seems abit different. It seems to have hardcoded the string comparison function "func" into main(), with intel optimised instruction set. "func" itself is never called.


test2 optimised (-O3) assembly:


```


Dump of assembler code for function main():

Address range 0x1100 to 0x1237:

0x0000000000001100 <+0>: endbr64

0x0000000000001104 <+4>: push %rbp

0x0000000000001105 <+5>: xor %edx,%edx

0x0000000000001107 <+7>: push %rbx

0x0000000000001108 <+8>: sub $0x48,%rsp

0x000000000000110c <+12>: mov %fs:0x28,%rax

0x0000000000001115 <+21>: mov %rax,0x38(%rsp)

0x000000000000111a <+26>: xor %eax,%eax

0x000000000000111c <+28>: lea 0x10(%rsp),%rdi

0x0000000000001121 <+33>: lea 0x8(%rsp),%rsi

0x0000000000001126 <+38>: movq $0x3e,0x8(%rsp)

0x000000000000112f <+47>: lea 0x20(%rsp),%rbx

0x0000000000001134 <+52>: mov %rbx,0x10(%rsp)

0x0000000000001139 <+57>: call 0x10d0 <_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm@plt>

0x000000000000113e <+62>: mov 0x8(%rsp),%rdx

0x0000000000001143 <+67>: movdqa 0xec5(%rip),%xmm0 # 0x2010

0x000000000000114b <+75>: xor %r8d,%r8d

0x000000000000114e <+78>: movabs $0x333231307a797877,%rdi

0x0000000000001158 <+88>: mov %rax,0x10(%rsp)

0x000000000000115d <+93>: mov %rdx,0x20(%rsp)

0x0000000000001162 <+98>: mov $0x3938,%edx

0x0000000000001167 <+103>: movups %xmm0,(%rax)

0x000000000000116a <+106>: movdqa 0xeae(%rip),%xmm0 # 0x2020

0x0000000000001172 <+114>: mov %rdi,0x30(%rax)

0x0000000000001176 <+118>: mov $0x989680,%edi

0x000000000000117b <+123>: movups %xmm0,0x10(%rax)

0x000000000000117f <+127>: movdqa 0xea9(%rip),%xmm0 # 0x2030

0x0000000000001187 <+135>: mov %dx,0x3c(%rax)

0x000000000000118b <+139>: mov 0x10(%rsp),%rdx

0x0000000000001190 <+144>: movl $0x37363534,0x38(%rax)

0x0000000000001197 <+151>: movups %xmm0,0x20(%rax)

0x000000000000119b <+155>: mov 0x8(%rsp),%rax

0x00000000000011a0 <+160>: mov %rax,0x18(%rsp)

0x00000000000011a5 <+165>: movb $0x0,(%rdx,%rax,1)

0x00000000000011a9 <+169>: mov 0x18(%rsp),%rdx

0x00000000000011ae <+174>: mov 0x10(%rsp),%rsi

0x00000000000011b3 <+179>: nopl 0x0(%rax,%rax,1)

0x00000000000011b8 <+184>: test %rdx,%rdx

0x00000000000011bb <+187>: je 0x11da <main()+218>

0x00000000000011bd <+189>: xor %eax,%eax

0x00000000000011bf <+191>: jmp 0x11d1 <main()+209>

0x00000000000011c1 <+193>: nopl 0x0(%rax)

0x00000000000011c8 <+200>: add $0x1,%rax

0x00000000000011cc <+204>: cmp %rdx,%rax

0x00000000000011cf <+207>: je 0x11da <main()+218>

0x00000000000011d1 <+209>: cmpb $0x20,(%rsi,%rax,1)

0x00000000000011d5 <+213>: jne 0x11c8 <main()+200>

0x00000000000011d7 <+215>: add %eax,%r8d

0x00000000000011da <+218>: sub $0x1,%edi

0x00000000000011dd <+221>: jne 0x11b8 <main()+184>

0x00000000000011df <+223>: mov %r8d,%edx

0x00000000000011e2 <+226>: lea 0xe1b(%rip),%rsi # 0x2004

0x00000000000011e9 <+233>: mov $0x1,%edi

0x00000000000011ee <+238>: xor %eax,%eax

0x00000000000011f0 <+240>: call 0x1090 <__printf_chk@plt>

0x00000000000011f5 <+245>: mov 0x10(%rsp),%rdi

--Type <RET> for more, q to quit, c to continue without paging--

0x00000000000011fa <+250>: cmp %rbx,%rdi

0x00000000000011fd <+253>: je 0x120d <main()+269>

0x00000000000011ff <+255>: mov 0x20(%rsp),%rax

0x0000000000001204 <+260>: lea 0x1(%rax),%rsi

0x0000000000001208 <+264>: call 0x10a0 <_ZdlPvm@plt>

0x000000000000120d <+269>: mov 0x38(%rsp),%rax

0x0000000000001212 <+274>: sub %fs:0x28,%rax

0x000000000000121b <+283>: jne 0x1226 <main()+294>

0x000000000000121d <+285>: add $0x48,%rsp

0x0000000000001221 <+289>: xor %eax,%eax

0x0000000000001223 <+291>: pop %rbx

0x0000000000001224 <+292>: pop %rbp

0x0000000000001225 <+293>: ret

0x0000000000001226 <+294>: call 0x10b0 <__stack_chk_fail@plt>

0x000000000000122b <+299>: endbr64

0x000000000000122f <+303>: mov %rax,%rbp

0x0000000000001232 <+306>: jmp 0x10e0 <main.cold>

Address range 0x10e0 to 0x1100:

0x00000000000010e0 <-32>: mov 0x10(%rsp),%rdi

0x00000000000010e5 <-27>: cmp %rbx,%rdi

0x00000000000010e8 <-24>: je 0x10f8 <main()-8>

0x00000000000010ea <-22>: mov 0x20(%rsp),%rax

0x00000000000010ef <-17>: lea 0x1(%rax),%rsi

0x00000000000010f3 <-13>: call 0x10a0 <_ZdlPvm@plt>

0x00000000000010f8 <-8>: mov %rbp,%rdi

0x00000000000010fb <-5>: call 0x10c0 <_Unwind_Resume@plt>

End of assembler dump.

(gdb) disas func

Dump of assembler code for function _Z4funcSt17basic_string_viewIcSt11char_traitsIcEE:

0x0000000000001330 <+0>: endbr64

0x0000000000001334 <+4>: test %rdi,%rdi

0x0000000000001337 <+7>: je 0x1360 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+48>

0x0000000000001339 <+9>: xor %eax,%eax

0x000000000000133b <+11>: jmp 0x1349 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+25>

0x000000000000133d <+13>: nopl (%rax)

0x0000000000001340 <+16>: add $0x1,%rax

0x0000000000001344 <+20>: cmp %rax,%rdi

0x0000000000001347 <+23>: je 0x1360 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+48>

0x0000000000001349 <+25>: cmpb $0x20,(%rsi,%rax,1)

0x000000000000134d <+29>: mov %eax,%r8d

0x0000000000001350 <+32>: jne 0x1340 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+16>

0x0000000000001352 <+34>: mov %r8d,%eax

0x0000000000001355 <+37>: ret

0x0000000000001356 <+38>: cs nopw 0x0(%rax,%rax,1)

0x0000000000001360 <+48>: xor %r8d,%r8d

0x0000000000001363 <+51>: mov %r8d,%eax

0x0000000000001366 <+54>: ret



```



So we can't compare the optimised versions. But even so, the optimised versions of test2 aren't actually faster. That's the weird thing. No string_view is being generated (constructor for string_view isn't being called), but the hardcoded comparison seems to be slower?



Conclusion


Don't use string_view, if you care about speed, as it's not actually faster.


There's just some stuff that never gets benchmarked, but cargo-cult coders will use relentlessly.


I have no idea where C++ standards are headed, but it's not necessarily going in the right direction, in my opinion.



Summary of wavelets

Summary of wavelets https://en.wikipedia.org/wiki/Wavelet_transform https://en.wikipedia.org/wiki/Discrete_wavelet_transform Read the 2nd UR...