
Commit 69e3a6a

bibo-mao authored and chenhuacai committed
LoongArch: Add checksum optimization for 64-bit system
The LoongArch platform is a 64-bit system that supports 8-byte memory accesses, but the generic checksum functions only use 4-byte memory accesses. Add an 8-byte memory access optimization for the checksum functions on LoongArch; the code is derived from the arm64 implementation. With hardware checksum offload disabled, iperf performance improves by about 10% with this patch.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
1 parent 8941e93 commit 69e3a6a

3 files changed

Lines changed: 208 additions & 1 deletion
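Background for the optimization (illustration only, not part of the patch): the generic checksum code walks the buffer 32 bits at a time, while the arch code below consumes 64 bits per load and folds each carry back into the running ones'-complement sum. A minimal userspace sketch of that 8-byte accumulation; csum_add64() and sum_by_8() are hypothetical names:

#include <stdint.h>
#include <stddef.h>

/* End-around-carry add: fold the carry of a 64-bit add back into the sum. */
static uint64_t csum_add64(uint64_t sum, uint64_t data)
{
        sum += data;
        return sum + (sum < data);
}

/* Sum an 8-byte-aligned buffer 64 bits at a time (qwords = length / 8). */
static uint64_t sum_by_8(const uint64_t *p, size_t qwords)
{
        uint64_t sum = 0;

        while (qwords--)
                sum = csum_add64(sum, *p++);
        return sum;     /* still needs folding to 16 bits, as csum_fold() does */
}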

arch/loongarch/include/asm/checksum.h

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2016 ARM Ltd.
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+#ifndef __ASM_CHECKSUM_H
+#define __ASM_CHECKSUM_H
+
+#include <linux/bitops.h>
+#include <linux/in6.h>
+
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
+
+/*
+ * turns a 32-bit partial checksum (e.g. from csum_partial) into a
+ * 1's complement 16-bit checksum.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+	u32 tmp = (__force u32)sum;
+
+	/*
+	 * swap the two 16-bit halves of sum
+	 * if there is a carry from adding the two 16-bit halves,
+	 * it will carry from the lower half into the upper half,
+	 * giving us the correct sum in the upper half.
+	 */
+	return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16);
+}
+#define csum_fold csum_fold
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries. ihl is the number
+ * of 32-bit words and is always >= 5.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	u64 sum;
+	__uint128_t tmp;
+	int n = ihl; /* we want it signed */
+
+	tmp = *(const __uint128_t *)iph;
+	iph += 16;
+	n -= 4;
+	tmp += ((tmp >> 64) | (tmp << 64));
+	sum = tmp >> 64;
+	do {
+		sum += *(const u32 *)iph;
+		iph += 4;
+	} while (--n > 0);
+
+	sum += ror64(sum, 32);
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+#define ip_fast_csum ip_fast_csum
+
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
+#include <asm-generic/checksum.h>
+
+#endif /* __ASM_CHECKSUM_H */

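A note on the csum_fold() trick in the header above (illustration only, not part of the patch): adding the half-swapped value sums the two 16-bit halves into the upper half, and any carry between them lands there as well, so a single add plus a complement and shift performs the whole 32-to-16-bit fold. A small userspace check, with fold32() as a hypothetical stand-in for csum_fold():

#include <stdint.h>
#include <stdio.h>

static uint16_t fold32(uint32_t sum)
{
        uint32_t rot = (sum << 16) | (sum >> 16);       /* swap the 16-bit halves */

        return (uint16_t)(~(sum + rot) >> 16);          /* hi + lo (+ carry), complemented */
}

int main(void)
{
        /* lo 0xfffe + hi 0x0003 = 0x10001, end-around carry gives 0x0002, complement gives 0xfffd */
        printf("%#x\n", (unsigned int)fold32(0x0003fffe));      /* prints 0xfffd */
        return 0;
}
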
arch/loongarch/lib/Makefile

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@
 #
 
 lib-y += delay.o memset.o memcpy.o memmove.o \
-	clear_user.o copy_user.o dump_tlb.o unaligned.o
+	clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o

arch/loongarch/lib/csum.c

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 Arm Ltd.
+
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <net/checksum.h>
+
+static u64 accumulate(u64 sum, u64 data)
+{
+	sum += data;
+	if (sum < data)
+		sum += 1;
+	return sum;
+}
+
+/*
+ * We over-read the buffer and this makes KASAN unhappy. Instead, disable
+ * instrumentation and call kasan explicitly.
+ */
+unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift, sum;
+	const u64 *ptr;
+	u64 data, sum64 = 0;
+
+	if (unlikely(len == 0))
+		return 0;
+
+	offset = (unsigned long)buff & 7;
+	/*
+	 * This is to all intents and purposes safe, since rounding down cannot
+	 * result in a different page or cache line being accessed, and @buff
+	 * should absolutely not be pointing to anything read-sensitive. We do,
+	 * however, have to be careful not to piss off KASAN, which means using
+	 * unchecked reads to accommodate the head and tail, for which we'll
+	 * compensate with an explicit check up-front.
+	 */
+	kasan_check_read(buff, len);
+	ptr = (u64 *)(buff - offset);
+	len = len + offset - 8;
+
+	/*
+	 * Head: zero out any excess leading bytes. Shifting back by the same
+	 * amount should be at least as fast as any other way of handling the
+	 * odd/even alignment, and means we can ignore it until the very end.
+	 */
+	shift = offset * 8;
+	data = *ptr++;
+	data = (data >> shift) << shift;
+
+	/*
+	 * Body: straightforward aligned loads from here on (the paired loads
+	 * underlying the quadword type still only need dword alignment). The
+	 * main loop strictly excludes the tail, so the second loop will always
+	 * run at least once.
+	 */
+	while (unlikely(len > 64)) {
+		__uint128_t tmp1, tmp2, tmp3, tmp4;
+
+		tmp1 = *(__uint128_t *)ptr;
+		tmp2 = *(__uint128_t *)(ptr + 2);
+		tmp3 = *(__uint128_t *)(ptr + 4);
+		tmp4 = *(__uint128_t *)(ptr + 6);
+
+		len -= 64;
+		ptr += 8;
+
+		/* This is the "don't dump the carry flag into a GPR" idiom */
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | sum64;
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		sum64 = tmp1 >> 64;
+	}
+	while (len > 8) {
+		__uint128_t tmp;
+
+		sum64 = accumulate(sum64, data);
+		tmp = *(__uint128_t *)ptr;
+
+		len -= 16;
+		ptr += 2;
+
+		data = tmp >> 64;
+		sum64 = accumulate(sum64, tmp);
+	}
+	if (len > 0) {
+		sum64 = accumulate(sum64, data);
+		data = *ptr;
+		len -= 8;
+	}
+	/*
+	 * Tail: zero any over-read bytes similarly to the head, again
+	 * preserving odd/even alignment.
+	 */
+	shift = len * -8;
+	data = (data << shift) >> shift;
+	sum64 = accumulate(sum64, data);
+
+	/* Finally, folding */
+	sum64 += (sum64 >> 32) | (sum64 << 32);
+	sum = sum64 >> 32;
+	sum += (sum >> 16) | (sum << 16);
+	if (offset & 1)
+		return (u16)swab32(sum);
+
+	return sum >> 16;
+}
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	__uint128_t src, dst;
+	u64 sum = (__force u64)csum;
+
+	src = *(const __uint128_t *)saddr->s6_addr;
+	dst = *(const __uint128_t *)daddr->s6_addr;
+
+	sum += (__force u32)htonl(len);
+	sum += (u32)proto << 24;
+	src += (src >> 64) | (src << 64);
+	dst += (dst >> 64) | (dst << 64);
+
+	sum = accumulate(sum, src >> 64);
+	sum = accumulate(sum, dst >> 64);
+
+	sum += ((sum >> 32) | (sum << 32));
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
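
A note on the __uint128_t arithmetic in do_csum()'s main loop above (illustration only, not part of the patch): "tmp += (tmp >> 64) | (tmp << 64)" adds the two 64-bit halves of a 128-bit value with the carry between them folded in, leaving the result in the upper half; this is what the in-code comment means by not dumping the carry flag into a GPR. A standalone sketch (requires a compiler with __int128 support, just as the kernel code relies on __uint128_t); fold128() is a hypothetical name:

#include <stdint.h>
#include <stdio.h>

static uint64_t fold128(unsigned __int128 tmp)
{
        tmp += (tmp >> 64) | (tmp << 64);       /* swap the 64-bit halves and add */
        return (uint64_t)(tmp >> 64);           /* hi + lo, end-around carry included */
}

int main(void)
{
        unsigned __int128 v = ((unsigned __int128)0x2 << 64) | 0xffffffffffffffffULL;

        /* 0x2 + 0xffffffffffffffff wraps to 0x1 with a carry, which is folded back in: 0x2 */
        printf("%#llx\n", (unsigned long long)fold128(v));      /* prints 0x2 */
        return 0;
}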

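Similarly, the head and tail handling in do_csum() above shifts over-read bytes out and back in order to zero them, since LoongArch is little-endian and the bytes before the buffer are the low-order bytes of the first aligned load (illustration only, not part of the patch; mask_head() and mask_tail() are hypothetical names):

#include <stdint.h>

/* Zero the low-order bytes of the first aligned load that precede the buffer (excess_bytes in 0..7). */
static uint64_t mask_head(uint64_t data, unsigned int excess_bytes)
{
        unsigned int shift = excess_bytes * 8;

        return (data >> shift) << shift;
}

/* Zero the high-order bytes of the last load that lie past the end of the buffer (valid_bytes in 1..8). */
static uint64_t mask_tail(uint64_t data, unsigned int valid_bytes)
{
        unsigned int shift = (8 - valid_bytes) * 8;

        return (data << shift) >> shift;
}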