The memset from season-1

/* This file contains a variety of implementations of memset. If you don't know
 * what memset is `man memset` may enlighten you. Its definition is
 *
 *  void* memset(void* b, int c, size_t len);
 *
 * I wrote this code partly as a gentle introduction to memset and partly as a
 * reminder to myself of how memset works. This code is in the public domain and
 * you are free to use it for any purpose (commercial or non-commercial) with or
 * without attribution to me. I don't guarantee the correctness of any of the
 * code herein and if you spot a mistake please let me know and I'll correct it.
 *
 * I wrote this code during my free time, but at a period during which I was
 * employed by National ICT Australia and it was heavily related to my work. If
 * push comes to shove it may be considered their intellectual property, but as
 * it is not functional freestanding code and (better) implementations of memset
 * are widely available, I hope they won't mind :)
 *
 * Matthew Fernandez, 2011
 */

#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Just for convenience let's setup a type for bytes. */
typedef unsigned char byte;

/* Let's start off with a fairly naive implementation of memset. This sets
 * memory byte-by-byte. While not being particularly efficient and being
 * slightly braindead, it does have the advantage of being readily
 * understandable and reasonably straightforward to implement without making
 * mistakes.
 */
void* bytewise_memset(void* s, int c, size_t sz) {
    byte* p = (byte*)s;

    /* c should only be a byte's worth of information anyway, but let's mask out
     * everything else just in case.
     */
    byte x = c & 0xff;

    while (sz--)
        *p++ = x;
    return s;
}

/* A smarter way of doing memset is word-by-word. Your architecture will have a
 * native word size (32 bits on x86) and reads/writes at this granularity will
 * be more efficient than those at finer granularities. Let's take a look at
 * memset for a 32-bit architecture.
 */
void* wordwise_32_memset(void* s, int c, size_t sz) {
    uint32_t* p = (uint32_t*)s;

    /* In this case the masking is actually important. */
    uint32_t x = c & 0xff;

    /* Construct a word's worth of the value we're supposed to be setting. */
    x |= x << 8;
    x |= x << 16;

    /* This technique (without a prologue and epilogue) will only cope with
     * sizes that are word-aligned. For example, you cannot use this function to
     * set a region 7 bytes long. Let's do some checks to make sure the size
     * passed is actually something we can cope with. It is worth noting that in
     * practice you would actually want to check the alignment of the start
     * pointer as well. Doing a word-wise memset on an unaligned pointer gains
     * you nothing and may even hurt performance.
     */
    assert(!(sz & 3)); /* Check sz is 4-byte-aligned. */
    sz >>= 2; /* Divide by number of bytes in a word. */

    while (sz--)
        *p++ = x;
    return s;
}

/* Now let's do an architecture-independent word-by-word memset. You would never
 * want to implement this in practice, as you are relying on the compiler to
 * optimise the statically determinable loops that could be manually written
 * much more efficiently if you know the target architecture, but this is a
 * useful thought exercise. Note that GCC defines the handy constant __WORDSIZE
 * that tells us the size (in bits) of words on this architecture.
 */
void* wordwise_memset(void* s, int c, size_t sz) {
    uintptr_t* p = (uintptr_t*)s;
    uintptr_t x = c & 0xff;
    int i;
    int bytes_per_word;

    /* Construct a word's worth of the byte value we need to set. */
    for (i = 3; (1<<i) < __WORDSIZE; ++i)
        x |= x << (1<<i);
    /* At this point i is the power of 2 which is equal to the word size. */

    /* 1<<i would be the number of bits per word, therefore (1<<i)/8 == 1<<(i-3)
     * is the number of bytes per word.
     */
    bytes_per_word = 1<<(i-3);

    /* Check sz is bytes_per_word-byte-aligned. */
    assert(!(sz & (bytes_per_word-1)));
    sz >>= i-3;

    while (sz--)
        *p++ = x;
    return s;
}

/* Let's return to the 32-bit word-wise version briefly to implement a version
 * that handles unaligned (with respect to word size) pointers and size. To
 * understand what's going on here it's best to look at an example:
 *
 *  Calling wordwise_32_unaligned_memset(2, 0, 7)...
 * Initial:             +-+-+-+-+-+-+-+
 *                      |2|3|4|5|6|7|8|  pp = 2, p = ?
 *                      +-+-+-+-+-+-+-+  sz = 7, tail = ?
 *                      |?|?|?|?|?|?|?|
 *                      +-+-+-+-+-+-+-+
 *
 * After the prologue:  +-+-+-+-+-+-+-+ Now the pointer (pp/p) is aligned.
 *                      |2|3|4|5|6|7|8|  pp = 4, p = 4
 *                      +-+-+-+-+-+-+-+  sz = 5, tail = 1
 *                      |0|0|?|?|?|?|?| (then we adjust sz to 5>>2 == 1)
 *                      +-+-+-+-+-+-+-+
 *
 * After the main loop: +-+-+-+-+-+-+-+ We can't set any more word length
 *                      |2|3|4|5|6|7|8| regions, but there's still one byte
 *                      +-+-+-+-+-+-+-+ remaining.
 *                      |0|0|0|0|0|0|?|  pp = 8, p = 8
 *                      +-+-+-+-+-+-+-+  sz = 0, tail = 1
 *
 * After the epilogue:  +-+-+-+-+-+-+-+ Now we're done.
 *                      |2|3|4|5|6|7|8|  pp = 9, p = 8
 *                      +-+-+-+-+-+-+-+  sz = 0, tail = 0
 *                      |0|0|0|0|0|0|0|
 *                      +-+-+-+-+-+-+-+
 */
void* wordwise_32_unaligned_memset(void* s, int c, size_t sz) {
    uint32_t* p;
    uint32_t x = c & 0xff;
    byte xx = c & 0xff;
    byte* pp = (byte*)s;
    size_t tail;

    /* Let's introduce a prologue to bump the starting location forward to the
     * next alignment boundary.
     */
    while (((unsigned int)pp & 3) && sz--)
        *pp++ = xx;
    p = (uint32_t*)pp;

    /* Let's figure out the number of bytes that will be trailing when the
     * word-wise loop taps out.
     */
    tail = sz & 3;

    /* The middle of this function is identical to the wordwise_32_memset minus
     * the alignment checks.
     */
    x |= x << 8;
    x |= x << 16;

    sz >>= 2;

    while (sz--)
        *p++ = x;

    /* Now we introduce an epilogue to account for the trailing bytes. */
    pp = (byte*)p;
    while (tail--)
        *pp++ = xx;

    return s;
}

/* Let's take the final logical step and implement an architecture-independent
 * memset that can cope with unaligned pointers and sizes. Note, the same
 * caveat as for wordwise_memset applies; you wouldn't write code like this in
 * real life.
 */
void* wordwise_unaligned_memset(void* s, int c, size_t sz) {
    uintptr_t* p;
    uintptr_t x = c & 0xff;
    byte* pp = (byte*)s;
    byte xx = c & 0xff;
    size_t tail;
    int i;
    int bytes_per_word;

    for (i = 3; (1<<i) < __WORDSIZE; ++i)
        x |= x << (1<<i);
    bytes_per_word = 1<<(i-3);

    /* Prologue. */
    while (((unsigned int)pp & (bytes_per_word-1)) && sz--)
        *pp++ = xx;
    tail = sz & (bytes_per_word-1);
    p = (uintptr_t*)pp;

    /* Main loop. */
    sz >>= i-3;
    while (sz--)
        *p++ = x;

    /* Epilogue. */
    pp = (byte*)p;
    while (tail--)
        *pp++ = xx;

    return s;
}

/* Finally, just for fun, let's look at memset using Duff's Device
 * (http://www.lysator.liu.se/c/duffs-device.html). The original Duff's Device
 * was not intended to target memset, but was aimed at solving the problem of
 * copying data from an array into a memory mapped hardware register. As a
 * result, it's not particularly suited to memset. Of course there's a big
 * "but" with this. As with all the algorithms in this file, the relative
 * performance is highly architecture- and compiler-dependent. If you want to
 * know the fastest algorithm for a given scenario you really have no choice
 * but to look at the generated assembly code.
 */
void* duffs_device_memset(void* s, int c, size_t sz) {
    byte* p = (byte*)s;
    byte x = c & 0xff;
    unsigned int leftover = sz & 0x7;

    /* Catch the pathological case of 0. */
    if (!sz)
        return s;

    /* To understand what's going on here, take a look at the original
     * bytewise_memset and consider unrolling the loop. For this situation
     * we'll unroll the loop 8 times (assuming a 32-bit architecture). Choosing
     * the level to which to unroll the loop can be a fine art...
     */
    sz = (sz + 7) >> 3;
    switch (leftover) {
        case 0: do { *p++ = x;
        case 7:      *p++ = x;
        case 6:      *p++ = x;
        case 5:      *p++ = x;
        case 4:      *p++ = x;
        case 3:      *p++ = x;
        case 2:      *p++ = x;
        case 1:      *p++ = x;
                } while (--sz > 0);
    }
    return s;
}

/* Lines below here are instrumentation for testing your implementation. */

#define CHECK(f, unaligned) \
    do { \
        int fail_byte = check_memset((f), (unaligned)); \
        if (fail_byte) \
            printf("%s %s check failed on byte %d.\n", \
                   (unaligned) ? "Unaligned" : "Aligned", #f, fail_byte - 1); \
    } while(0)

#define BUFFER_LEN 4096

/* This function does some very basic checking of your memset function. If you
 * really want to comprehensively test your function you should check the
 * regions either side of the memory you are setting to make sure your function
 * is actually obeying the limits passed to it.
 *
 * Note, the argument unaligned determines whether we pass an unaligned pointer
 * and size or not. Only certain of the above implementations will pass this
 * test.
 */
int check_memset(void* f(), int unaligned) {
    char buffer[BUFFER_LEN];
    int i;
    char set;

    for (set = 0; (unsigned int)set <= 0xff; ++set) {
        f(buffer + unaligned, set, BUFFER_LEN - 2*unaligned);
        for (i = unaligned; i < BUFFER_LEN - 2*unaligned; ++i)
            if (buffer[i] != set)
                return i + 1;
    }

    return 0;
}

/* When executed, this program will just validate the implementations in this
 * file. Note that the unaligned tests are only run on the functions that can
 * cope with unaligned values.
 */
int main(int argc, char** argv) {

    /* Use GCC's built-in memset to validate our checking function. */
    CHECK(memset, 0);
    CHECK(memset, 1);

    /* Check our implementations. */
    CHECK(bytewise_memset, 0);
    CHECK(bytewise_memset, 1);
    CHECK(wordwise_32_memset, 0);
    CHECK(wordwise_memset, 0);
    CHECK(wordwise_32_unaligned_memset, 0);
    CHECK(wordwise_32_unaligned_memset, 1);
    CHECK(wordwise_unaligned_memset, 0);
    CHECK(wordwise_unaligned_memset, 1);
    CHECK(duffs_device_memset, 0);
    CHECK(duffs_device_memset, 1);

    return 0;
}
season-1 / memset Goto Github PK

memset's Introduction

memset's People

Contributors

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent