From 97034edb35e92361daaa24512989d00f6c3fd517 Mon Sep 17 00:00:00 2001
From: Paul Selkirk
Date: Mon, 26 Nov 2018 17:26:55 -0500
Subject: Add loop unrolling to bring the profilable mem* functions closer to
 newlib, because memset is called a LOT in the course of RSA signing, and we
 need to understand how much time we're actually spending there.

---
Notes (discarded by git-am, not part of the commit message):

  * memset(): the fill word is now built in uint32_t. The earlier form,
    (c8 << 24) | (c8 << 16) | (c8 << 8) | (c8), promotes c8 to int and
    shifts into the sign bit whenever c8 >= 0x80, which is undefined.
  * The #include operands (<stdint.h>, <string.h>) are inferred from the
    types the code uses (uintN_t, size_t); please confirm against the tree.
  * Header comments were added to memcpy/memset/memcmp/memmove, so the new
    file is 154 lines. The blob id on its index line has not been
    recomputed.

 libraries/libprof/Makefile  |   6 ++-
 libraries/libprof/memfunc.c | 154 ++++++++++++++++++++++++++++++++++++++++++++
 memfunc.c                   | 101 -----------------------------
 projects/hsm/Makefile       |   1 -
 4 files changed, 159 insertions(+), 103 deletions(-)
 create mode 100644 libraries/libprof/memfunc.c
 delete mode 100644 memfunc.c

diff --git a/libraries/libprof/Makefile b/libraries/libprof/Makefile
index 4fe5fb4..37b9a23 100644
--- a/libraries/libprof/Makefile
+++ b/libraries/libprof/Makefile
@@ -1,12 +1,16 @@
 LIB = libprof.a
 
-OBJS = gmon.o profil.o profiler.o
+OBJS = gmon.o profil.o profiler.o memfunc.o
 
 # Don't profile the profiling code, because that way lies madness (and recursion).
 CFLAGS := $(subst -pg,,$(CFLAGS))
 
 all: $(LIB)
 
+# But do profile the mem functions
+memfunc.o: memfunc.c
+	$(CC) $(CFLAGS) -pg -c -o $@ $<
+
 %.o : %.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
diff --git a/libraries/libprof/memfunc.c b/libraries/libprof/memfunc.c
new file mode 100644
index 0000000..fc908e1
--- /dev/null
+++ b/libraries/libprof/memfunc.c
@@ -0,0 +1,154 @@
+#include <stdint.h>
+#include <string.h>
+
+/*
+ * Profilable substitutes for mem*(), lacking libc_p.a
+ *
+ * This code was written with reference to newlib, and was recently
+ * brought closer into line with newlib, to make profiling more accurate.
+ *
+ * Newlib is maintained by Cygwin, which is Red Hat. There is no copyright
+ * statement in the corresponding newlib source files, nor is there a
+ * COPYING file in newlib/libc/string or newlib/libc. Consider this file
+ * to be covered under one or more of the 50 copyright notices in
+ * newlib/COPYING, most of which are BSD. In any case, this file is only
+ * used for profiling, and is not used in production builds.
+ */
+
+#define is_word_aligned(x) (((size_t)(x) & 3) == 0)
+
+/*
+ * Copy n bytes from src to dst and return dst. When n >= 4 and both
+ * pointers are 4-byte aligned, copy 16 bytes per iteration, then single
+ * words, then any trailing bytes; otherwise copy byte by byte.
+ */
+void *memcpy(void *dst, const void *src, size_t n)
+{
+    uint8_t *d8 = (uint8_t *)dst;
+    uint8_t *s8 = (uint8_t *)src;
+
+    if (n >= sizeof(uint32_t) && is_word_aligned(src) && is_word_aligned(dst)) {
+        uint32_t *d32 = (uint32_t *)dst;
+        uint32_t *s32 = (uint32_t *)src;
+        while (n >= 4 * sizeof(uint32_t)) {
+            *d32++ = *s32++;
+            *d32++ = *s32++;
+            *d32++ = *s32++;
+            *d32++ = *s32++;
+            n -= 4 * sizeof(uint32_t);
+        }
+        while (n >= sizeof(uint32_t)) {
+            *d32++ = *s32++;
+            n -= sizeof(uint32_t);
+        }
+        d8 = (uint8_t *)d32;
+        s8 = (uint8_t *)s32;
+    }
+    while (n-- > 0) {
+        *d8++ = *s8++;
+    }
+
+    return dst;
+}
+
+/*
+ * Fill n bytes at dst with (uint8_t)c and return dst. Leading bytes are
+ * stored one at a time until dst is 4-byte aligned, the bulk is stored as
+ * words (16 bytes per iteration, then single words), then trailing bytes.
+ */
+void *memset(void *dst, int c, size_t n)
+{
+    uint8_t *d8 = (uint8_t *)dst;
+    uint8_t c8 = (uint8_t)c;
+
+    while (!is_word_aligned(d8)) {
+        if (n--)
+            *d8++ = c8;
+        else
+            return dst;
+    }
+    if (n >= sizeof(uint32_t)) {
+        uint32_t *d32 = (uint32_t *)d8;
+        /*
+         * Build the fill word in uint32_t. Shifting c8 itself would promote
+         * it to int, and (c8 << 24) is undefined for c8 >= 0x80.
+         */
+        uint32_t c32 = c8;
+        c32 |= c32 << 8;
+        c32 |= c32 << 16;
+        while (n >= 4 * sizeof(uint32_t)) {
+            *d32++ = c32;
+            *d32++ = c32;
+            *d32++ = c32;
+            *d32++ = c32;
+            n -= 4 * sizeof(uint32_t);
+        }
+        while (n >= sizeof(uint32_t)) {
+            *d32++ = c32;
+            n -= sizeof(uint32_t);
+        }
+        d8 = (uint8_t *)d32;
+    }
+    while (n-- > 0) {
+        *d8++ = c8;
+    }
+
+    return dst;
+}
+
+/*
+ * Compare n bytes of dst and src. Returns 0 if they are equal, otherwise
+ * the dst byte minus the src byte at the first position where they differ
+ * (both taken as unsigned values). When n >= 4 and both pointers are 4-byte
+ * aligned, equal words are skipped first and the rest is compared bytewise.
+ */
+int memcmp(const void *dst, const void *src, size_t n)
+{
+    uint8_t *d8 = (uint8_t *)dst;
+    uint8_t *s8 = (uint8_t *)src;
+
+    if (n >= sizeof(uint32_t) && is_word_aligned(src) && is_word_aligned(dst)) {
+        uint32_t *d32 = (uint32_t *)dst;
+        uint32_t *s32 = (uint32_t *)src;
+        while (n >= sizeof(uint32_t)) {
+            if (*d32 != *s32)
+                break;
+            d32++;
+            s32++;
+            n -= sizeof(uint32_t);
+        }
+        d8 = (uint8_t *)d32;
+        s8 = (uint8_t *)s32;
+    }
+    while (n-- > 0) {
+        if (*d8 != *s8)
+            return (*d8 - *s8);
+        d8++;
+        s8++;
+    }
+
+    return 0;
+}
+
+/*
+ * Copy n bytes from src to dst, allowing the regions to overlap, and
+ * return dst. If dst starts after src but before src + n, the copy runs
+ * backwards byte by byte; otherwise it is delegated to memcpy().
+ */
+void *memmove(void *dst, const void *src, size_t n)
+{
+    uint8_t *d8 = (uint8_t *)dst;
+    uint8_t *s8 = (uint8_t *)src;
+
+    if ((s8 < d8) && (d8 < s8 + n)) {
+        /* Destructive overlap...have to copy backwards */
+        s8 += n;
+        d8 += n;
+        while (n-- > 0) {
+            *--d8 = *--s8;
+        }
+        return dst;
+    }
+
+    return memcpy(dst, src, n);
+}
diff --git a/memfunc.c b/memfunc.c
deleted file mode 100644
index fd94b28..0000000
--- a/memfunc.c
+++ /dev/null
@@ -1,101 +0,0 @@
-#include <stdint.h>
-#include <string.h>
-
-/*
- * Profilable substitutes for mem*(), lacking libc_p.a
- *
- * This code was written with reference to newlib, but does not copy every
- * quirk and loop-unrolling optimization from newlib. Its only purpose is
- * to let us figure out who is calling memcpy 2 million times.
- */
-
-#define is_word_aligned(x) (((size_t)(x) & 3) == 0)
-
-void *memcpy(void *dst, const void *src, size_t n)
-{
-    uint8_t *d8 = (uint8_t *)dst;
-    uint8_t *s8 = (uint8_t *)src;
-
-    if (n >= 4 && is_word_aligned(src) && is_word_aligned(dst)) {
-        uint32_t *d32 = (uint32_t *)dst;
-        uint32_t *s32 = (uint32_t *)src;
-        while (n >= 4) {
-            *d32++ = *s32++;
-            n -= 4;
-        }
-        d8 = (uint8_t *)d32;
-        s8 = (uint8_t *)s32;
-    }
-    while (n-- > 0) {
-        *d8++ = *s8++;
-    }
-
-    return dst;
-}
-
-void *memset(void *dst, int c, size_t n)
-{
-    uint8_t *d8 = (uint8_t *)dst;
-    uint8_t c8 = (uint8_t)c;
-
-    if (n >= 4 && is_word_aligned(dst)) {
-        uint32_t *d32 = (uint32_t *)dst;
-        uint32_t c32 = (c8 << 24) | (c8 << 16) | (c8 << 8) | (c8);
-        while (n >= 4) {
-            *d32++ = c32;
-            n -= 4;
-        }
-        d8 = (uint8_t *)d32;
-    }
-    while (n-- > 0) {
-        *d8++ = c8;
-    }
-
-    return dst;
-}
-
-int memcmp(const void *dst, const void *src, size_t n)
-{
-    uint8_t *d8 = (uint8_t *)dst;
-    uint8_t *s8 = (uint8_t *)src;
-
-    if (n >= 4 && is_word_aligned(src) && is_word_aligned(dst)) {
-        uint32_t *d32 = (uint32_t *)dst;
-        uint32_t *s32 = (uint32_t *)src;
-        while (n >= 4) {
-            if (*d32 != *s32)
-                break;
-            d32++;
-            s32++;
-            n -= 4;
-        }
-        d8 = (uint8_t *)d32;
-        s8 = (uint8_t *)s32;
-    }
-    while (n-- > 0) {
-        if (*d8 != *s8)
-            return (*d8 - *s8);
-        d8++;
-        s8++;
-    }
-
-    return 0;
-}
-
-void *memmove(void *dst, const void *src, size_t n)
-{
-    uint8_t *d8 = (uint8_t *)dst;
-    uint8_t *s8 = (uint8_t *)src;
-
-    if ((s8 < d8) && (d8 < s8 + n)) {
-        /* Destructive overlap...have to copy backwards */
-        s8 += n;
-        d8 += n;
-        while (n-- > 0) {
-            *--d8 = *--s8;
-        }
-        return dst;
-    }
-
-    return memcpy(dst, src, n);
-}
diff --git a/projects/hsm/Makefile b/projects/hsm/Makefile
index 3430e14..37c552d 100644
--- a/projects/hsm/Makefile
+++ b/projects/hsm/Makefile
@@ -25,7 +25,6 @@ LDFLAGS += -mfloat-abi=hard -mfpu=fpv4-sp-d16
 LDFLAGS += -Wl,--gc-sections
 
 ifdef DO_PROFILING
-OBJS += $(TOPLEVEL)/memfunc.o
 LDFLAGS += --specs=rdimon.specs -lc -lrdimon
 endif
 
-- 
cgit v1.2.3