My program's input is a large string, around 30,000 characters. Below is the code for my own strlen:
/* Naive strlen: walk the string one byte at a time until the NUL
 * terminator and return the number of bytes before it.
 * Fix: the index must be size_t, not int — with an int counter the
 * function truncates (and hits signed-overflow UB) on strings longer
 * than INT_MAX, and the signed counter is implicitly converted to the
 * unsigned size_t return type. */
size_t strlen(const char *c)
{
    size_t i;

    i = 0;
    while (c[i] != '\0')
        i++;
    return (i);
}
The version of strlen above takes ~2.1 seconds to execute. Through a different version, I was able to achieve ~1.4 seconds.
My question is, why are multiple if statements faster than executing a while loop?
/* Unrolled strlen: tests 9 bytes per loop iteration, so the pointer
 * advance and the loop's backward branch execute once every 9 bytes
 * instead of once per byte. */
size_t strlen(const char *str)
{
    const char *scan = str;

    for (;;)
    {
        if (!scan[0])
            return ((size_t)(scan - str));
        if (!scan[1])
            return ((size_t)(scan - str) + 1);
        if (!scan[2])
            return ((size_t)(scan - str) + 2);
        if (!scan[3])
            return ((size_t)(scan - str) + 3);
        if (!scan[4])
            return ((size_t)(scan - str) + 4);
        if (!scan[5])
            return ((size_t)(scan - str) + 5);
        if (!scan[6])
            return ((size_t)(scan - str) + 6);
        if (!scan[7])
            return ((size_t)(scan - str) + 7);
        if (!scan[8])
            return ((size_t)(scan - str) + 8);
        scan += 9;
    }
}
My question is: why is that many if statements faster than simply running a loop?
Edit: With the standard library strlen, it takes around 1.25 seconds.
Your question is pertinent, but your benchmark is incomplete and has surprising results.
Here is a modified and instrumented version of your code:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <unistd.h>
#define VERSION 3      /* strlen implementation to benchmark: 1 (naive), 2 (unrolled), or 3 (word-at-a-time) */
#define TRIALS 100     /* number of timed trials; only the fastest is reported */
#define ITERATIONS 100 /* file copy passes per trial */
#if VERSION == 1
/* Baseline strlen (VERSION 1): advance a pointer one byte at a time
 * until the NUL terminator is found. */
size_t strlen1(const char *c) {
    const char *p = c;

    while (*p != '\0')
        p++;
    return (size_t)(p - c);
}
#define strlen(s) strlen1(s)
#elif VERSION == 2
/* Unrolled strlen (VERSION 2): checks 9 bytes per pass so the pointer
 * increment and the loop branch run once per 9 bytes rather than once
 * per byte. */
size_t strlen2(const char *str) {
    const char *scan = str;

    for (;;) {
        if (!scan[0])
            return (size_t)(scan - str);
        if (!scan[1])
            return (size_t)(scan - str) + 1;
        if (!scan[2])
            return (size_t)(scan - str) + 2;
        if (!scan[3])
            return (size_t)(scan - str) + 3;
        if (!scan[4])
            return (size_t)(scan - str) + 4;
        if (!scan[5])
            return (size_t)(scan - str) + 5;
        if (!scan[6])
            return (size_t)(scan - str) + 6;
        if (!scan[7])
            return (size_t)(scan - str) + 7;
        if (!scan[8])
            return (size_t)(scan - str) + 8;
        scan += 9;
    }
}
#define strlen(s) strlen2(s)
#elif VERSION == 3
/* Word-at-a-time strlen (VERSION 3): scans 8 bytes per iteration using the
 * "haszero" bit trick — ((x - 0x01..01) & ~x) & 0x80..80 is nonzero iff
 * some byte of x is zero. */
size_t strlen3(const char *str) {
const uint64_t *px, sub = 0x0101010101010101, mask = 0x8080808080808080;
const char *p;
/* Prologue: advance byte by byte until p is 8-byte aligned, so the word
 * loads below are aligned (unaligned loads are UB on some architectures). */
for (p = str; (uintptr_t)p & 7; p++) {
if (!*p)
return p - str;
}
/* Main loop: one 64-bit load per 8 bytes; break at the first word that
 * contains a zero byte.
 * NOTE(review): reading char data through a uint64_t* technically violates
 * strict aliasing, and the last load may read past the terminator within
 * the aligned word — standard for this idiom, but formally UB; confirm the
 * target compiler/platform tolerates it. */
for (px = (const uint64_t *)(uintptr_t)p;;) {
uint64_t x = *px++;
if (((x - sub) & ~x) & mask)
break;
}
/* px was post-incremented, so the terminator lies in word (px - 1);
 * locate the exact byte with a final linear scan. */
for (p = (const char *)(px - 1); *p; p++)
continue;
return p - str;
}
#define strlen(s) strlen3(s)
#endif
/* Read one newline-terminated line from fd into a freshly allocated,
 * NUL-terminated buffer stored in *pp (the '\n' itself is not included).
 * Returns 1 when a line was produced, 0 at EOF with nothing read or on
 * allocation failure (*pp is then NULL).  The caller owns *pp and must
 * free() it.  Reads in large chunks and uses lseek() to push back the
 * bytes that followed the newline, leaving the file offset just past it. */
int get_next_line(int fd, char **pp) {
char buf[32768];
char *line = NULL, *new_line;
char *p;
ssize_t line_size = 0; /* bytes accumulated in `line` so far */
ssize_t nread, chunk;
while ((nread = read(fd, buf, sizeof buf)) > 0) {
/* chunk = bytes of this read that belong to the current line */
p = memchr(buf, '\n', nread);
chunk = (p == NULL) ? nread : p - buf;
/* Grow through a temporary so the old buffer is not leaked on failure */
new_line = realloc(line, line_size + chunk + 1);
if (!new_line) {
free(line);
*pp = NULL;
return 0;
}
line = new_line;
memcpy(line + line_size, buf, chunk);
line_size += chunk;
line[line_size] = '\0';
if (p != NULL) {
/* Rewind to just after the '\n'; the over-read tail of this
 * chunk will be re-read by the next call. */
lseek(fd, chunk + 1 - nread, SEEK_CUR);
break;
}
}
/* NOTE(review): a read() error (nread == -1) is treated like EOF here;
 * a partially accumulated line is still returned as success. */
*pp = line;
return line != NULL;
}
int main() {
char *line = NULL;
int fd, fd2, count, trial;
clock_t min_clock = 0;
fd = open("one_big_fat_line.txt", O_RDONLY);
if (fd < 0) {
printf("cannot open one_big_fat_line.txt\n");
return 1;
}
fd2 = open("output.txt", O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
if (fd2 < 0) {
printf("cannot open output.txt\n");
return 1;
}
for (trial = 0; trial < TRIALS; trial++) {
clock_t t = clock();
for (count = 0; count < ITERATIONS; count++) {
lseek(fd, 0L, SEEK_SET);
lseek(fd2, 0L, SEEK_SET);
while (get_next_line(fd, &line) == 1) {
write(fd2, line, strlen(line));
write(fd2, "\n", 1);
free(line);
}
}
t = clock() - t;
if (min_clock == 0 || min_clock > t)
min_clock = t;
}
close(fd);
close(fd2);
double time_taken = (double)min_clock / CLOCKS_PER_SEC;
printf("Version %d time: %.3f microseconds\n", VERSION, time_taken * 1000000 / ITERATIONS);
return 0;
}
The program opens a file, reads lines from it with a custom function get_next_line()
that uses unix system calls and malloc
to return arbitrary sized lines. It then writes these lines using the unix system call write
and appends a newline with a separate system call.
Benchmarking this sequence with your test file, a 30000 byte file with a single line of ASCII characters, shows a very different performance from what you measure: depending on the selected implementation of strlen
and the compilation optimisation settings, the time on my laptop range from 15 microseconds to 82 microseconds per iteration, nowhere close to 1 or 2 seconds as you observe.
Using the C library default implementation, I get 14.5 microseconds per iteration with or without optimisations.
Using your strlen1
naive implementation, I get 82 microseconds with optimisations disabled and 25 microseconds with -O3
optimisations.
Using your strlen2
unrolled implementation, the speed improves to 30 microseconds with -O0
and 20 microseconds with -O3
.
Finally, a more advanced C implementation reading 8 bytes at a time strlen3
provides further improved performance at 21 microseconds with -O0
and 15.5 microseconds with -O3
.
Note how compiler optimisations affect the performance much more than manual optimisations.
The reason your unrolled version performs better is that, in the naive version, the generated code increments the pointer and performs a conditional jump once per byte, whereas the unrolled version reduces both of these to once every 9 bytes. Note however that the C compiler gets almost the same performance with -O3
on the naive code as what you get unrolling the loop yourself.
The advanced version is very close in performance to the C library implementation, which may use assembly language with SIMD instructions. It reads 8 bytes at a time and performs an arithmetic trick to detect if any of these bytes has its topmost bit changed from 0
to 1
when subtracting 1
from its value. The extra initial steps are required to align the pointer to read 64-bit words, thus avoiding unaligned reads that have undefined behavior on some architectures. It also assumes that memory protection is not available at the byte level. On modern x86 systems, memory protection has a 4K or larger granularity, but some other systems such as Windows 2.x the protection was much finer grained, preventing this optimisation altogether.
Note however that the benchmark also measures the time to read from the input file, locate the newline and write to the output file. The relative performance of strlen
and strlen3
is probably much more significant. Indeed, a separate benchmark for just strlen(line)
with your 30000 byte line shows a time of 2.2 microseconds for strlen3()
and 0.85 microseconds for strlen()
.
Conclusions:
-O3
is a good default.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.