Skip to content

Asm versions of netstack memcpy() and lwip_standard_chksum() #51

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 2, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions libraries/net/lwip/lwip-sys/arch/cc.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,21 @@ typedef uintptr_t mem_ptr_t;
#define ALIGNED(n) __attribute__((aligned (n)))
#endif

/* Select the copy and checksum implementations used by lwIP.
   LWIP_CHKSUM_ALGORITHM must be defined exactly once, so it is set inside
   each branch below rather than unconditionally beforehand. */

/* Provide Thumb-2 routines for GCC to improve performance */
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
/* Route lwIP's MEMCPY and LWIP_CHKSUM hooks to the hand written Thumb-2
   assembly routines. */
#define MEMCPY(dst,src,len) thumb2_memcpy(dst,src,len)
#define LWIP_CHKSUM thumb2_checksum
/* Set algorithm to 0 so that unused lwip_standard_chksum function
   doesn't generate compiler warning */
#define LWIP_CHKSUM_ALGORITHM 0

void* thumb2_memcpy(void* pDest, const void* pSource, size_t length);
/* pData is only read, so it is const-qualified here as it is in the
   definition of thumb2_checksum. */
u16_t thumb2_checksum(const void* pData, int length);
#else
/* Used with IP headers only */
#define LWIP_CHKSUM_ALGORITHM 1
#endif


#ifdef LWIP_DEBUG

Expand Down
126 changes: 126 additions & 0 deletions libraries/net/lwip/lwip-sys/arch/checksum.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)


/* This is a hand written Thumb-2 assembly language version of the
algorithm 3 version of lwip_standard_chksum in lwIP's inet_chksum.c. It
performs the checksumming 32-bits at a time and even unrolls the loop to
perform two of these 32-bit adds per loop iteration.

Parameters:
pData - start of the data to sum (arrives in r0). It may be at any
alignment; the code aligns itself before doing word loads.
length - number of bytes to sum (arrives in r1).

Returns:
16-bit 1's complement summation (not inversed), left in r0.

Register usage inside the routine:
r0 - read pointer, advanced as data is consumed
r1 - bytes remaining
r2 - running sum
r3 - non-zero if pData started at an odd address
r4 - scratch for loaded data (callee saved, hence the push/pop)

NOTE: This function does return a uint16_t from the assembly language code
but is marked as void so that GCC doesn't issue warning because it
doesn't know about this low level return.

NOTE(review): length is compared with signed branches (blt) and the only
explicit empty checks are cbz, which test for exactly zero. A negative
length therefore still reaches a byte load. Callers presumably never pass
a negative value - confirm.

NOTE(review): the placement of the leading odd byte (shifted left by 8) and
of the trailing byte (added unshifted) presumably assumes little-endian
data accesses - confirm for the targets this is built for.
*/
__attribute__((naked)) void /*uint16_t*/ thumb2_checksum(const void* pData, int length)
{
__asm (
".syntax unified\n"
".thumb\n"

// Push non-volatile registers we use on stack. Push link register too to
// keep stack 8-byte aligned and allow single pop to restore and return.
" push {r4, lr}\n"
// Initialize sum, r2, to 0.
" movs r2, #0\n"
// Remember whether pData was at odd address in r3. This is used later to
// know if it needs to swap the result since the summation will be done at
// an offset of 1, rather than 0.
" ands r3, r0, #1\n"
// Need to 2-byte align? If not skip ahead.
" beq 1$\n"
// Odd address: we can return if there are no bytes to sum. r2 is still 0
// here so the value returned at 9$ is 0.
" cbz r1, 9$\n"

// 2-byte align.
// Place the first data byte in odd summation location since it needs to be
// swapped later. It's ok to overwrite r2 here as it only had a value of 0
// up until now. Advance r0 pointer and decrement r1 length as we go.
" ldrb r2, [r0], #1\n"
" lsls r2, r2, #8\n"
" subs r1, r1, #1\n"

// Need to 4-byte align? If not skip ahead. r0 is 2-byte aligned by now.
"1$:\n"
" ands r4, r0, #3\n"
" beq 2$\n"
// Have more than 1 byte left to align? If not skip ahead to take care of
// trailing byte.
" cmp r1, #2\n"
" blt 7$\n"

// 4-byte align by consuming one half-word. No adc is needed after this add:
// r2 holds at most a single byte shifted left by 8, so the add cannot carry
// out of 32 bits.
" ldrh r4, [r0], #2\n"
" adds r2, r2, r4\n"
" subs r1, r1, #2\n"

// Main summing loop which sums up data 2 words at a time.
// Make sure that we have at least 8 bytes left to sum.
"2$:\n"
" cmp r1, #8\n"
" blt 3$\n"
// Sum next two words. Each adc adds the carry out of the preceding 32-bit
// add back into the sum (end-around carry), as 1's complement addition
// requires.
" ldr r4, [r0], #4\n"
" adds r2, r4\n"
" adc r2, r2, #0\n"
" ldr r4, [r0], #4\n"
" adds r2, r4\n"
" adc r2, r2, #0\n"
" subs r1, r1, #8\n"
" b 2$\n"

// Sum up any remaining half-words.
"3$:\n"
// Make sure that we have at least 2 bytes left to sum.
" cmp r1, #2\n"
" blt 7$\n"
// Sum up next half word, continue to apply carry.
" ldrh r4, [r0], #2\n"
" adds r2, r4\n"
" adc r2, r2, #0\n"
" subs r1, r1, #2\n"
" b 3$\n"

// Handle trailing byte, if it exists. The pointer is not advanced since
// nothing is read after this.
"7$:\n"
" cbz r1, 8$\n"
" ldrb r4, [r0]\n"
" adds r2, r4\n"
" adc r2, r2, #0\n"

// Fold 32-bit checksum into 16-bit checksum. Adding the upper and lower
// half-words once can give up to 0x1FFFE, so the fold is done a second time
// to bring the result down to 16 bits.
"8$:\n"
" ubfx r4, r2, #16, #16\n"
" ubfx r2, r2, #0, #16\n"
" adds r2, r4\n"
" ubfx r4, r2, #16, #16\n"
" ubfx r2, r2, #0, #16\n"
" adds r2, r4\n"

// Swap bytes if started at odd address
" cbz r3, 9$\n"
" rev16 r2, r2\n"

// Return final sum in r0. The pop restores r4 and loads the saved link
// register value straight into pc, which returns to the caller.
"9$: mov r0, r2\n"
" pop {r4, pc}\n"
);
}

#endif
59 changes: 59 additions & 0 deletions libraries/net/lwip/lwip-sys/arch/memcpy.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)

#include <stdio.h>


/* This is a hand written Thumb-2 assembly language version of the
   standard C memcpy() function that can be used by the lwIP networking
   stack to improve its performance.  It copies 4 bytes at a time and
   unrolls the loop to perform 4 of these copies per loop iteration.
   Whatever is left after the 16-byte blocks (0 to 15 bytes) is copied
   one byte at a time.

   Parameters:
     pDest   - destination buffer (arrives in r0).
     pSource - source buffer (arrives in r1).
     length  - number of bytes to copy (arrives in r2).

   Returns:
     pDest, left in r0, matching the standard memcpy() contract.  The copy
     loops advance r0 past the destination, so r0 is wound back by length
     just before returning.

   Register usage inside the routine:
     r0  - write pointer, advanced as data is stored
     r1  - read pointer, advanced as data is loaded
     r2  - length, never modified so that it can be used to restore r0
     r3  - loop counter (16-byte blocks, then remaining bytes)
     r12 - scratch for the data being moved

   NOTE: As with other naked routines here, the C signature is declared as
         returning void so that GCC doesn't issue a warning about a missing
         return statement; the actual return value is produced by the
         assembly language code.

   NOTE(review): The word loop uses ldr/str without first aligning either
         pointer, so it presumably relies on the core permitting unaligned
         word accesses - confirm for the targets this is built for.
*/
__attribute__((naked)) void /* void* */ thumb2_memcpy(void* pDest, const void* pSource, size_t length)
{
    __asm (
        ".syntax unified\n"
        ".thumb\n"

        // Copy 16 bytes at a time first.  r3 = number of 16-byte blocks.
        "    lsrs    r3, r2, #4\n"
        "    beq.n   2$\n"
        "1$: ldr     r12, [r1], #4\n"
        "    str     r12, [r0], #4\n"
        "    ldr     r12, [r1], #4\n"
        "    str     r12, [r0], #4\n"
        "    ldr     r12, [r1], #4\n"
        "    str     r12, [r0], #4\n"
        "    ldr     r12, [r1], #4\n"
        "    str     r12, [r0], #4\n"
        "    subs    r3, #1\n"
        "    bne     1$\n"

        // Copy byte by byte for what is left.  r3 = length modulo 16.
        "2$:\n"
        "    ands    r3, r2, #0xf\n"
        "    beq.n   4$\n"
        "3$: ldrb    r12, [r1], #1\n"
        "    strb    r12, [r0], #1\n"
        "    subs    r3, #1\n"
        "    bne     3$\n"

        // r0 now equals pDest + length and r2 still holds length, so
        // subtracting recovers pDest as the return value.
        "4$: subs    r0, r0, r2\n"
        // Return to caller.
        "    bx      lr\n"
    );
}

#endif