Why C++'s string_view is slower.
C++17's string_view is meant to replace parameters where you used to have "const std::string &str".
However, it's actually slower than "const std::string &str" because it requires a creation of an object on the stack (in assembler), which is the string_view object.
Even when it's optimised with -O3 (gcc flag), it's still slower.
test1.cpp:
```
#include <string>
int func(const std::string &str)
{
for (int i = 0; i < str.size(); i++) {
if (str[i] == ' ')
return i;
}
return 0;
}
int main()
{
std::string blah = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
int count = 0;
for (int i = 0; i < 10000000; i++) {
count += func(blah);
}
printf("%d\n", count);
}
```
test2.cpp:
```
cat test2.cpp
#include <string>
int func(std::string_view str)
{
for (int i = 0; i < str.size(); i++) {
if (str[i] == ' ')
return i;
}
return 0;
}
int main()
{
std::string blah = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
int count = 0;
for (int i = 0; i < 10000000; i++) {
count += func(blah);
}
printf("%d\n", count);
}
```
func() is a dummy, constant-time, operation, so that it doesn't get optimised out by the compiler as it returns a value that's needed by the printf("%d\n", count); call later on.
test1.cpp's time taken (the first run is non-optimised):
```
$ g++ -o test test.cpp
$ time ./test
0
real 0m1.080s
user 0m1.075s
sys 0m0.005s
$ g++ -o test test.cpp -O3
$ time ./test
0
real 0m0.221s
user 0m0.220s
sys 0m0.001s
```
test2.cpp's time taken (the first run is non-optimised):
```
$ g++ -o test2 test2.cpp
$ time ./test2
0
real 0m1.140s
user 0m1.135s
sys 0m0.005s
$ g++ -o test2 test2.cpp -O3
$ time ./test2
0
real 0m0.227s
user 0m0.223s
sys 0m0.004s
```
When comparing the two assembly outputs (non-optimised):
test1.cpp:
```
0x00000000000014d2 <+94>: lea -0x40(%rbp),%rax
0x00000000000014d6 <+98>: mov %rax,%rdi
0x00000000000014d9 <+101>: call 0x1409 <_Z4funcRKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE>
0x00000000000014de <+106>: add %eax,-0x48(%rbp)
0x00000000000014e1 <+109>: addl $0x1,-0x44(%rbp)
0x00000000000014e5 <+113>: cmpl $0x98967f,-0x44(%rbp)
0x00000000000014ec <+120>: jle 0x14d2 <main()+94>
0x00000000000014ee <+122>: mov -0x48(%rbp),%eax
```
test2.cpp:
```
0x00000000000014af <+78>: movl $0x0,-0x48(%rbp)
0x00000000000014b6 <+85>: movl $0x0,-0x44(%rbp)
0x00000000000014bd <+92>: jmp 0x14e6 <main()+133>
0x00000000000014bf <+94>: lea -0x40(%rbp),%rax
0x00000000000014c3 <+98>: mov %rax,%rdi
0x00000000000014c6 <+101>: call 0x1250 <_ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEcvSt17basic_string_viewIcS2_EEv@plt>
0x00000000000014cb <+106>: mov %rax,%rcx
0x00000000000014ce <+109>: mov %rdx,%rbx
0x00000000000014d1 <+112>: mov %rdx,%rax
0x00000000000014d4 <+115>: mov %rcx,%rdi
0x00000000000014d7 <+118>: mov %rax,%rsi
0x00000000000014da <+121>: call 0x13e9 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE>
0x00000000000014df <+126>: add %eax,-0x48(%rbp)
0x00000000000014e2 <+129>: addl $0x1,-0x44(%rbp)
0x00000000000014e6 <+133>: cmpl $0x98967f,-0x44(%rbp)
```
In test2.cpp it calls the string_view constructor, and then calls func(std::string_view); whereas in test1.cpp it just calls func(const std::string &str).
In the optimised versions though, it seems abit different. It seems to have hardcoded the string comparison function "func" into main(), with intel optimised instruction set. "func" itself is never called.
test2 optimised (-O3) assembly:
```
Dump of assembler code for function main():
Address range 0x1100 to 0x1237:
0x0000000000001100 <+0>: endbr64
0x0000000000001104 <+4>: push %rbp
0x0000000000001105 <+5>: xor %edx,%edx
0x0000000000001107 <+7>: push %rbx
0x0000000000001108 <+8>: sub $0x48,%rsp
0x000000000000110c <+12>: mov %fs:0x28,%rax
0x0000000000001115 <+21>: mov %rax,0x38(%rsp)
0x000000000000111a <+26>: xor %eax,%eax
0x000000000000111c <+28>: lea 0x10(%rsp),%rdi
0x0000000000001121 <+33>: lea 0x8(%rsp),%rsi
0x0000000000001126 <+38>: movq $0x3e,0x8(%rsp)
0x000000000000112f <+47>: lea 0x20(%rsp),%rbx
0x0000000000001134 <+52>: mov %rbx,0x10(%rsp)
0x0000000000001139 <+57>: call 0x10d0 <_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm@plt>
0x000000000000113e <+62>: mov 0x8(%rsp),%rdx
0x0000000000001143 <+67>: movdqa 0xec5(%rip),%xmm0 # 0x2010
0x000000000000114b <+75>: xor %r8d,%r8d
0x000000000000114e <+78>: movabs $0x333231307a797877,%rdi
0x0000000000001158 <+88>: mov %rax,0x10(%rsp)
0x000000000000115d <+93>: mov %rdx,0x20(%rsp)
0x0000000000001162 <+98>: mov $0x3938,%edx
0x0000000000001167 <+103>: movups %xmm0,(%rax)
0x000000000000116a <+106>: movdqa 0xeae(%rip),%xmm0 # 0x2020
0x0000000000001172 <+114>: mov %rdi,0x30(%rax)
0x0000000000001176 <+118>: mov $0x989680,%edi
0x000000000000117b <+123>: movups %xmm0,0x10(%rax)
0x000000000000117f <+127>: movdqa 0xea9(%rip),%xmm0 # 0x2030
0x0000000000001187 <+135>: mov %dx,0x3c(%rax)
0x000000000000118b <+139>: mov 0x10(%rsp),%rdx
0x0000000000001190 <+144>: movl $0x37363534,0x38(%rax)
0x0000000000001197 <+151>: movups %xmm0,0x20(%rax)
0x000000000000119b <+155>: mov 0x8(%rsp),%rax
0x00000000000011a0 <+160>: mov %rax,0x18(%rsp)
0x00000000000011a5 <+165>: movb $0x0,(%rdx,%rax,1)
0x00000000000011a9 <+169>: mov 0x18(%rsp),%rdx
0x00000000000011ae <+174>: mov 0x10(%rsp),%rsi
0x00000000000011b3 <+179>: nopl 0x0(%rax,%rax,1)
0x00000000000011b8 <+184>: test %rdx,%rdx
0x00000000000011bb <+187>: je 0x11da <main()+218>
0x00000000000011bd <+189>: xor %eax,%eax
0x00000000000011bf <+191>: jmp 0x11d1 <main()+209>
0x00000000000011c1 <+193>: nopl 0x0(%rax)
0x00000000000011c8 <+200>: add $0x1,%rax
0x00000000000011cc <+204>: cmp %rdx,%rax
0x00000000000011cf <+207>: je 0x11da <main()+218>
0x00000000000011d1 <+209>: cmpb $0x20,(%rsi,%rax,1)
0x00000000000011d5 <+213>: jne 0x11c8 <main()+200>
0x00000000000011d7 <+215>: add %eax,%r8d
0x00000000000011da <+218>: sub $0x1,%edi
0x00000000000011dd <+221>: jne 0x11b8 <main()+184>
0x00000000000011df <+223>: mov %r8d,%edx
0x00000000000011e2 <+226>: lea 0xe1b(%rip),%rsi # 0x2004
0x00000000000011e9 <+233>: mov $0x1,%edi
0x00000000000011ee <+238>: xor %eax,%eax
0x00000000000011f0 <+240>: call 0x1090 <__printf_chk@plt>
0x00000000000011f5 <+245>: mov 0x10(%rsp),%rdi
--Type <RET> for more, q to quit, c to continue without paging--
0x00000000000011fa <+250>: cmp %rbx,%rdi
0x00000000000011fd <+253>: je 0x120d <main()+269>
0x00000000000011ff <+255>: mov 0x20(%rsp),%rax
0x0000000000001204 <+260>: lea 0x1(%rax),%rsi
0x0000000000001208 <+264>: call 0x10a0 <_ZdlPvm@plt>
0x000000000000120d <+269>: mov 0x38(%rsp),%rax
0x0000000000001212 <+274>: sub %fs:0x28,%rax
0x000000000000121b <+283>: jne 0x1226 <main()+294>
0x000000000000121d <+285>: add $0x48,%rsp
0x0000000000001221 <+289>: xor %eax,%eax
0x0000000000001223 <+291>: pop %rbx
0x0000000000001224 <+292>: pop %rbp
0x0000000000001225 <+293>: ret
0x0000000000001226 <+294>: call 0x10b0 <__stack_chk_fail@plt>
0x000000000000122b <+299>: endbr64
0x000000000000122f <+303>: mov %rax,%rbp
0x0000000000001232 <+306>: jmp 0x10e0 <main.cold>
Address range 0x10e0 to 0x1100:
0x00000000000010e0 <-32>: mov 0x10(%rsp),%rdi
0x00000000000010e5 <-27>: cmp %rbx,%rdi
0x00000000000010e8 <-24>: je 0x10f8 <main()-8>
0x00000000000010ea <-22>: mov 0x20(%rsp),%rax
0x00000000000010ef <-17>: lea 0x1(%rax),%rsi
0x00000000000010f3 <-13>: call 0x10a0 <_ZdlPvm@plt>
0x00000000000010f8 <-8>: mov %rbp,%rdi
0x00000000000010fb <-5>: call 0x10c0 <_Unwind_Resume@plt>
End of assembler dump.
(gdb) disas func
Dump of assembler code for function _Z4funcSt17basic_string_viewIcSt11char_traitsIcEE:
0x0000000000001330 <+0>: endbr64
0x0000000000001334 <+4>: test %rdi,%rdi
0x0000000000001337 <+7>: je 0x1360 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+48>
0x0000000000001339 <+9>: xor %eax,%eax
0x000000000000133b <+11>: jmp 0x1349 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+25>
0x000000000000133d <+13>: nopl (%rax)
0x0000000000001340 <+16>: add $0x1,%rax
0x0000000000001344 <+20>: cmp %rax,%rdi
0x0000000000001347 <+23>: je 0x1360 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+48>
0x0000000000001349 <+25>: cmpb $0x20,(%rsi,%rax,1)
0x000000000000134d <+29>: mov %eax,%r8d
0x0000000000001350 <+32>: jne 0x1340 <_Z4funcSt17basic_string_viewIcSt11char_traitsIcEE+16>
0x0000000000001352 <+34>: mov %r8d,%eax
0x0000000000001355 <+37>: ret
0x0000000000001356 <+38>: cs nopw 0x0(%rax,%rax,1)
0x0000000000001360 <+48>: xor %r8d,%r8d
0x0000000000001363 <+51>: mov %r8d,%eax
0x0000000000001366 <+54>: ret
```
So we can't compare the optimised versions. But even so, the optimised versions of test2 aren't actually faster. That's the weird thing. No string_view is being generated (constructor for string_view isn't being called), but the hardcoded comparison seems to be slower?
Conclusion
Don't use string_view, if you care about speed, as it's not actually faster.
There's just some stuff that never gets benchmarked, but cargo-cult coders will use relentlessly.
I have no idea where C++ standards are headed, but it's not necessarily going in the right direction, in my opinion.