/*
 * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
 * This sum is often used as a simple checksum in networking.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "networking.h"
#include "chksum_common.h"

always_inline
static inline uint32_t
slurp_head32(const void **pptr, uint32_t *nbytes)
{
    uint32_t sum = 0;
    Assert(*nbytes >= 4);
    uint32_t off = (uintptr_t) *pptr % 4;
    if (likely(off != 0))
    {
        /* Get rid of bytes 0..off-1: load the enclosing aligned word and
           mask out the off bytes that lie before *pptr */
        const unsigned char *ptr32 = align_ptr(*pptr, 4);
        uint32_t mask = ~0U << (CHAR_BIT * off);
        sum = load32(ptr32) & mask;
        *pptr = ptr32 + 4;
        *nbytes -= 4 - off;
    }
    return sum;
}

/* Additional loop unrolling would help when not auto-vectorizing */
unsigned short
__chksum(const void *ptr, unsigned int nbytes)
{
    bool swap = false;
    uint64_t sum = 0;

    if (nbytes > 300)
    {
        /* 4-byte align pointer.  Starting at an odd address swaps the byte
           lanes of every 16-bit word, so remember to swap the result back. */
        swap = (uintptr_t) ptr & 1;
        sum = slurp_head32(&ptr, &nbytes);
    }
    /* Else benefit of aligning not worth the overhead */

    /* Sum all 16-byte chunks.  32-bit loads accumulate into a 64-bit sum,
       so carries are preserved until the final fold. */
    const char *cptr = ptr;
    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
    {
        uint64_t h0 = load32(cptr + 0);
        uint64_t h1 = load32(cptr + 4);
        uint64_t h2 = load32(cptr + 8);
        uint64_t h3 = load32(cptr + 12);
        sum += h0 + h1 + h2 + h3;
        cptr += 16;
    }
    nbytes %= 16;
    Assert(nbytes < 16);

    /* Handle any trailing 4-byte chunks */
    while (nbytes >= 4)
    {
        sum += load32(cptr);
        cptr += 4;
        nbytes -= 4;
    }
    Assert(nbytes < 4);

    if (nbytes & 2)
    {
        sum += load16(cptr);
        cptr += 2;
    }

    if (nbytes & 1)
    {
        sum += *(uint8_t *)cptr;
    }

    /* Fold the 64-bit sum to 16 bits with end-around carry, swapping the
       result bytes if the input started at an odd address */
    return fold_and_swap(sum, swap);
}
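
/*
 * Hypothetical usage sketch (not part of the original file): cross-checks
 * __chksum against a plain 16-bit-word ones'-complement sum.  It assumes a
 * little-endian host, and assumes __chksum returns the folded 16-bit sum
 * itself (not its complement), as the fold_and_swap call above suggests; an
 * IP-style header checksum would be the ones' complement (~) of that value.
 * The CHKSUM_USAGE_SKETCH guard and ref_chksum_le helper are illustrative
 * names, not part of this library.
 */
#ifdef CHKSUM_USAGE_SKETCH
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reference: sum 16-bit host-order words with end-around carry (RFC 1071). */
static uint16_t
ref_chksum_le(const uint8_t *p, unsigned int n)
{
    uint64_t sum = 0;
    while (n >= 2)
    {
        uint16_t w;
        memcpy(&w, p, 2);       /* 16-bit word in host (little-endian) order */
        sum += w;
        p += 2;
        n -= 2;
    }
    if (n != 0)
        sum += *p;              /* trailing byte, upper byte implicitly zero */
    while (sum >> 16)           /* fold with end-around carry */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t) sum;
}

int
main(void)
{
    uint8_t buf[1000];
    for (unsigned int i = 0; i < sizeof buf; i++)
        buf[i] = (uint8_t) (i * 7 + 3);

    uint16_t got = __chksum(buf, sizeof buf);
    uint16_t want = ref_chksum_le(buf, sizeof buf);
    printf("__chksum = 0x%04x, reference = 0x%04x\n", got, want);
    return got == want ? 0 : 1;
}
#endif /* CHKSUM_USAGE_SKETCH */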