Skip to content

Commit 42e27e7

Browse files
committed
Merge pull request #51 from adamgreen/netMorePerformanceWork
Asm versions of netstack memcpy() and lwip_standard_chksum() [Note] I'm generally a bit reluctant to include optimizations like this (from an architectural standpoint), because they tend to be a bit too specific (for example, this one works only with lwIP+GCC+Cortex-M3 or M4). For now it looks as though this is the right place for them, although the optimized memcpy should ideally be in libc (or, even better, be replaced with a DMA transfer in this particular case). Still, this will be both a nice optimization and a reminder of what we need to implement/change in the future.
2 parents f44914d + 7dddd9e commit 42e27e7

File tree

3 files changed

+200
-2
lines changed

3 files changed

+200
-2
lines changed

libraries/net/lwip/lwip-sys/arch/cc.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,21 @@ typedef uintptr_t mem_ptr_t;
8282
#define ALIGNED(n) __attribute__((aligned (n)))
8383
#endif
8484

85-
/* Used with IP headers only */
86-
#define LWIP_CHKSUM_ALGORITHM 1
85+
/* Provide Thumb-2 routines for GCC to improve performance.

   When building with GCC for a Thumb-2 capable core, lwIP's MEMCPY and
   LWIP_CHKSUM hooks are redirected to the hand-written assembly routines
   declared below.  Every other configuration keeps lwIP's own C checksum
   implementation. */
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
    #define MEMCPY(dst,src,len)     thumb2_memcpy(dst,src,len)
    #define LWIP_CHKSUM             thumb2_checksum
    /* Set algorithm to 0 so that the unused lwip_standard_chksum function
       doesn't generate a compiler warning */
    #define LWIP_CHKSUM_ALGORITHM   0

    /* Copies length bytes from pSource to pDest. */
    void* thumb2_memcpy(void* pDest, const void* pSource, size_t length);
    /* Returns the 16-bit 1's complement sum (not inverted) of length bytes
       starting at pData.  The data is only read, so the pointer is
       const-qualified to match the definition. */
    u16_t thumb2_checksum(const void* pData, int length);
#else
    /* Used with IP headers only */
    #define LWIP_CHKSUM_ALGORITHM   1
#endif
99+
87100

88101
#ifdef LWIP_DEBUG
89102

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
*/
15+
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
16+
17+
18+
/* Hand-written Thumb-2 assembly language version of the algorithm 3 variant
   of lwip_standard_chksum in lwIP's inet_chksum.c.  It performs the
   checksumming 32 bits at a time and unrolls the main loop to perform two of
   these 32-bit adds per loop iteration.

   Register use inside the routine:
     r0 - read pointer into the data, advanced as bytes are consumed
     r1 - number of bytes still to be summed
     r2 - running 32-bit sum, folded down to 16 bits before returning
     r3 - non-zero when the data started at an odd address
     r4 - scratch for loaded data and for the fold

   Returns:
     16-bit 1's complement summation (not inverted), placed in r0.

   NOTE: This function does return a uint16_t from the assembly language code
         but is marked as void so that GCC doesn't issue a warning because it
         doesn't know about this low level return.
*/
30+
__attribute__((naked)) void /*uint16_t*/ thumb2_checksum(const void* pData, int length)
31+
{
32+
__asm (
33+
".syntax unified\n"
34+
".thumb\n"
35+
36+
// Push the non-volatile register we use (r4) on the stack. Push the link
37+
// register too, to keep the stack 8-byte aligned and allow a single pop to
// both restore r4 and return.
38+
" push {r4, lr}\n"
39+
// Initialize sum, r2, to 0.
40+
" movs r2, #0\n"
41+
// Remember in r3 whether pData was at an odd address. This is used later to
42+
// know if the result needs to be byte swapped, since the summation will have
43+
// been done at an offset of 1, rather than 0. The flags set here also drive
// the branch that follows.
44+
" ands r3, r0, #1\n"
45+
// Need to 2-byte align? If not (address is even) skip ahead.
46+
" beq 1$\n"
47+
// Odd address: we can return right away if there are no bytes to sum.
// r2 is still 0 at this point, so label 9 returns a sum of 0.
48+
" cbz r1, 9$\n"
49+
50+
// 2-byte align.
51+
// Place the first data byte in the odd (upper) summation location since it
52+
// will be swapped later. It's ok to overwrite r2 here as it only had a value
53+
// of 0 up until now. Advance r0 pointer and decrement r1 length as we go.
54+
" ldrb r2, [r0], #1\n"
55+
" lsls r2, r2, #8\n"
56+
" subs r1, r1, #1\n"
57+
58+
// Need to 4-byte align? If not (address is a multiple of 4) skip ahead.
59+
"1$:\n"
60+
" ands r4, r0, #3\n"
61+
" beq 2$\n"
62+
// Have more than 1 byte left to align? If not skip ahead to take care of
63+
// the trailing byte. blt is a signed comparison of the remaining length.
64+
" cmp r1, #2\n"
65+
" blt 7$\n"
66+
67+
// 4-byte align by consuming one half-word. No carry needs to be applied
// here: r2 holds at most one byte shifted up by 8 bits, so adding a 16-bit
// value cannot overflow 32 bits.
68+
" ldrh r4, [r0], #2\n"
69+
" adds r2, r2, r4\n"
70+
" subs r1, r1, #2\n"
71+
72+
// Main summing loop which sums up data 2 words at a time.
73+
// Make sure that we have more than 7 bytes left to sum.
74+
"2$:\n"
75+
" cmp r1, #8\n"
76+
" blt 3$\n"
77+
// Sum the next two words. After each 32-bit add, adc with #0 adds the carry
78+
// out of that add back into the sum (1's complement end-around carry).
79+
" ldr r4, [r0], #4\n"
80+
" adds r2, r4\n"
81+
" adc r2, r2, #0\n"
82+
" ldr r4, [r0], #4\n"
83+
" adds r2, r4\n"
84+
" adc r2, r2, #0\n"
85+
" subs r1, r1, #8\n"
86+
" b 2$\n"
87+
88+
// Sum up any remaining half-words.
89+
"3$:\n"
90+
// Make sure that we have more than 1 byte left to sum.
91+
" cmp r1, #2\n"
92+
" blt 7$\n"
93+
// Sum up next half-word, continuing to apply the carry.
94+
" ldrh r4, [r0], #2\n"
95+
" adds r2, r4\n"
96+
" adc r2, r2, #0\n"
97+
" subs r1, r1, #2\n"
98+
" b 3$\n"
99+
100+
// Handle trailing byte, if it exists. It is added into the low byte of the
// sum.
// NOTE(review): cbz only tests for zero, so a negative remaining length
// would also load a byte here - confirm callers never pass length < 0.
101+
"7$:\n"
102+
" cbz r1, 8$\n"
103+
" ldrb r4, [r0]\n"
104+
" adds r2, r4\n"
105+
" adc r2, r2, #0\n"
106+
107+
// Fold 32-bit checksum into 16-bit checksum: add the upper half-word to the
// lower half-word. The first add can carry into bit 16, so the fold is done
// a second time to absorb that carry.
108+
"8$:\n"
109+
" ubfx r4, r2, #16, #16\n"
110+
" ubfx r2, r2, #0, #16\n"
111+
" adds r2, r4\n"
112+
" ubfx r4, r2, #16, #16\n"
113+
" ubfx r2, r2, #0, #16\n"
114+
" adds r2, r4\n"
115+
116+
// Swap the two bytes of the result if the data started at an odd address
// (r3 != 0).
117+
" cbz r3, 9$\n"
118+
" rev16 r2, r2\n"
119+
120+
// Return final sum in r0, restoring r4 and returning through the saved lr.
121+
"9$: mov r0, r2\n"
122+
" pop {r4, pc}\n"
123+
);
124+
}
125+
126+
#endif
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
*/
15+
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)

#include <stdio.h>


/* Hand-written Thumb-2 assembly language version of the standard C memcpy()
   function that can be used by the lwIP networking stack to improve its
   performance.  It copies 4 bytes at a time and unrolls the loop to perform
   4 of these copies per loop iteration; the remaining 0-15 bytes are then
   copied one byte at a time.

   Parameters:
     pDest   - start of the destination buffer (arrives in r0).
     pSource - start of the source buffer (arrives in r1).
     length  - number of bytes to copy (arrives in r2); 0 is allowed.

   Returns:
     pDest, left untouched in r0 just as memcpy() would return it.

   NOTE: The C return type is kept as void, so that GCC doesn't issue a
         warning about a naked function without a return statement.  Callers
         that declare this routine as returning void* still receive pDest,
         because r0 is never modified.

   NOTE(review): word loads/stores are issued without checking the alignment
         of either pointer, so this presumably relies on the core allowing
         unaligned LDR/STR - confirm for the targets in use.
*/
__attribute__((naked)) void /* void* */ thumb2_memcpy(void* pDest, const void* pSource, size_t length)
{
    __asm (
        ".syntax unified\n"
        ".thumb\n"

        // Write through a copy of the destination pointer so that r0 still
        // holds pDest when we return.
        "    mov     r3, r0\n"

        // Copy 16 bytes at a time first.  The byte count in r2 is biased by
        // -16: the carry flag left by each subtraction is set only while at
        // least one more full 16 byte chunk remains (unsigned comparison).
        "    subs    r2, r2, #16\n"
        "    blo     2$\n"
        "1$: ldr     r12, [r1], #4\n"
        "    str     r12, [r3], #4\n"
        "    ldr     r12, [r1], #4\n"
        "    str     r12, [r3], #4\n"
        "    ldr     r12, [r1], #4\n"
        "    str     r12, [r3], #4\n"
        "    ldr     r12, [r1], #4\n"
        "    str     r12, [r3], #4\n"
        "    subs    r2, r2, #16\n"
        "    bhs     1$\n"

        // Undo the bias, which leaves the 0-15 bytes still to be copied in
        // r2, and copy byte by byte for what is left.
        "2$:\n"
        "    adds    r2, r2, #16\n"
        "    beq.n   4$\n"
        "3$: ldrb    r12, [r1], #1\n"
        "    strb    r12, [r3], #1\n"
        "    subs    r2, r2, #1\n"
        "    bne     3$\n"

        // Return to caller with r0 still equal to pDest.
        "4$: bx      lr\n"
    );
}

#endif

0 commit comments

Comments
 (0)