/*
 * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
 * This sum is often used as a simple checksum in networking.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "networking.h"
#include "chksum_common.h"
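
/*
 * Illustrative sketch (not compiled): how a wide accumulator is folded down
 * to a 16-bit ones' complement sum with end-around carry.  The folding step
 * of fold_and_swap() from chksum_common.h is assumed to work along these
 * lines; this helper name and body are for illustration only.
 */
#if 0
static uint16_t
fold64_example(uint64_t sum)
{
    /* Fold 64 -> 32 bits, adding the carry-outs back in (end-around carry) */
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffffffff) + (sum >> 32);
    /* Fold 32 -> 16 bits the same way */
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t) sum;
}
#endif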
always_inline
static inline uint32_t
slurp_head32(const void **pptr, uint32_t *nbytes)
{
    uint32_t sum = 0;
    Assert(*nbytes >= 4);
    uint32_t off = (uintptr_t) *pptr % 4;
    if (likely(off != 0))
    {
	/* Get rid of bytes 0..off-1 */
	const unsigned char *ptr32 = align_ptr(*pptr, 4);
	uint32_t mask = ~0U << (CHAR_BIT * off);
	sum = load32(ptr32) & mask;
	*pptr = ptr32 + 4;
	*nbytes -= 4 - off;
    }
    return sum;
}
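
/*
 * Worked example of the masking in slurp_head32 above, assuming load32 is a
 * little-endian native load (as on typical Arm configurations): with *pptr
 * at aligned address A + 1, off == 1 and mask == 0xffffff00, so the byte at
 * A (which lies before the buffer) is cleared while the bytes at A+1..A+3
 * are kept, and *pptr advances to the next aligned word at A + 4.
 */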
/* Additional loop unrolling would help when not auto-vectorizing */
unsigned short
__chksum(const void *ptr, unsigned int nbytes)
{
    bool swap = false;
    uint64_t sum = 0;

    if (nbytes > 300)
    {
	/* 4-byte align pointer.  Starting the 32-bit loads at an odd
	   address swaps the byte lanes of every 16-bit word; the ones'
	   complement sum is byte-order agnostic, so this is undone by
	   byte-swapping the folded result at the end.  */
	swap = (uintptr_t) ptr & 1;
	sum = slurp_head32(&ptr, &nbytes);
    }
    /* Else benefit of aligning not worth the overhead */
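
    /*
     * The 64-bit accumulator cannot overflow below: with a 32-bit unsigned
     * int, nbytes / 16 gives at most about 2^28 iterations, each adding
     * less than 2^34, which stays well below 2^64.
     */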
    /* Sum all 16-byte chunks */
    const char *cptr = ptr;
    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
    {
	uint64_t h0 = load32(cptr + 0);
	uint64_t h1 = load32(cptr + 4);
	uint64_t h2 = load32(cptr + 8);
	uint64_t h3 = load32(cptr + 12);
	sum += h0 + h1 + h2 + h3;
	cptr += 16;
    }
    nbytes %= 16;
    Assert(nbytes < 16);

    /* Handle any trailing 4-byte chunks */
    while (nbytes >= 4)
    {
	sum += load32(cptr);
	cptr += 4;
	nbytes -= 4;
    }
    Assert(nbytes < 4);

    /* Handle a trailing 2-byte chunk */
    if (nbytes & 2)
    {
	sum += load16(cptr);
	cptr += 2;
    }

    /* Handle a trailing odd byte */
    if (nbytes & 1)
    {
	sum += *(uint8_t *)cptr;
    }

    /* Fold the 64-bit sum down to 16 bits, byte-swapping if required */
    return fold_and_swap(sum, swap);
}
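
/*
 * Minimal usage sketch (not compiled): the buffer contents here are
 * arbitrary and for illustration only.  Note that the Internet checksum
 * field would hold the bitwise complement of the sum returned here.
 */
#if 0
#include <stdio.h>

int
main(void)
{
    unsigned char pkt[20] = {0x45, 0x00, 0x00, 0x14};	/* example bytes */
    unsigned short sum = __chksum(pkt, sizeof(pkt));
    printf("ones' complement sum: 0x%04x\n", sum);
    return 0;
}
#endif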