/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)


/* This is a hand-written Thumb-2 assembly language version of the
   algorithm 3 version of lwip_standard_chksum in lwIP's inet_chksum.c. It
   performs the checksumming 32 bits at a time and even unrolls the loop to
   perform two of these 32-bit adds per loop iteration.

   Returns:
       16-bit 1's complement summation (not inverted).

   NOTE: This function does return a uint16_t from the assembly language code
         but is marked as void so that GCC doesn't issue a warning, since it
         doesn't know about this low-level return.
*/
__attribute__((naked)) void /*uint16_t*/ thumb2_checksum(const void* pData, int length)
{
    __asm (
        ".syntax unified\n"
        ".thumb\n"

        // Push the non-volatile registers we use onto the stack. Push the link
        // register too to keep the stack 8-byte aligned and to allow a single
        // pop to both restore and return.
        " push {r4, lr}\n"
        // Initialize the sum, r2, to 0.
        " movs r2, #0\n"
        // Remember in r3 whether pData was at an odd address. This is used
        // later to know whether the result needs to be byte-swapped, since the
        // summation will have been done at an offset of 1 rather than 0.
        " ands r3, r0, #1\n"
        // Need to 2-byte align? If not, skip ahead.
        " beq 1$\n"
        // We can return if there are no bytes to sum.
        " cbz r1, 9$\n"

        // 2-byte align.
        // Place the first data byte in the odd summation location since it
        // needs to be swapped later. It's OK to overwrite r2 here as it only
        // held a value of 0 up until now. Advance the r0 pointer and decrement
        // the r1 length as we go.
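        // Example (illustrative): a first data byte of 0xAB enters the
        // running sum as 0x0000AB00.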
        " ldrb r2, [r0], #1\n"
        " lsls r2, r2, #8\n"
        " subs r1, r1, #1\n"

        // Need to 4-byte align? If not, skip ahead.
        "1$:\n"
        " ands r4, r0, #3\n"
        " beq 2$\n"
        // Have more than 1 byte left to align? If not, skip ahead to take care
        // of the trailing byte.
        " cmp r1, #2\n"
        " blt 7$\n"

        // 4-byte align.
        " ldrh r4, [r0], #2\n"
        " adds r2, r2, r4\n"
        " subs r1, r1, #2\n"

        // Main summing loop, which sums up the data 2 words at a time.
        // Make sure that we have at least 8 bytes left to sum.
        "2$:\n"
        " cmp r1, #8\n"
        " blt 3$\n"
        // Sum the next two words, folding any carry out of each 32-bit add
        // back into the running sum (end-around carry).
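        // Example (illustrative): adding 0x00000002 to a running sum of
        // 0xFFFFFFFF produces 0x00000001 with the carry flag set; the adc
        // then folds that carry back in, giving 0x00000002.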
        " ldr r4, [r0], #4\n"
        " adds r2, r4\n"
        " adc r2, r2, #0\n"
        " ldr r4, [r0], #4\n"
        " adds r2, r4\n"
        " adc r2, r2, #0\n"
        " subs r1, r1, #8\n"
        " b 2$\n"

        // Sum up any remaining half-words.
        "3$:\n"
        // Make sure that we have at least 2 bytes left to sum.
        " cmp r1, #2\n"
        " blt 7$\n"
        // Sum up the next half-word, continuing to apply the carry.
        " ldrh r4, [r0], #2\n"
        " adds r2, r4\n"
        " adc r2, r2, #0\n"
        " subs r1, r1, #2\n"
        " b 3$\n"

        // Handle the trailing byte, if it exists.
        "7$:\n"
        " cbz r1, 8$\n"
        " ldrb r4, [r0]\n"
        " adds r2, r4\n"
        " adc r2, r2, #0\n"

        // Fold the 32-bit checksum into a 16-bit checksum.
        "8$:\n"
        " ubfx r4, r2, #16, #16\n"
        " ubfx r2, r2, #0, #16\n"
        " adds r2, r4\n"
        " ubfx r4, r2, #16, #16\n"
        " ubfx r2, r2, #0, #16\n"
        " adds r2, r4\n"
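        // Example (illustrative): a 32-bit sum of 0x0003FFFE folds to
        // 0x0003 + 0xFFFE = 0x00010001 on the first pass, and the second
        // pass then folds the new carry: 0x0001 + 0x0001 = 0x0002.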

        // Swap the bytes if the data started at an odd address.
        " cbz r3, 9$\n"
        " rev16 r2, r2\n"
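        // rev16 reverses the bytes within each 16-bit halfword; since the
        // folded sum now fits in the low halfword, this undoes the one-byte
        // offset that the odd starting address introduced into the summation.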

        // Return final sum.
        "9$: mov r0, r2\n"
        " pop {r4, pc}\n"
    );
}
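
/* Usage sketch (an illustration/assumption, not part of the original code):
   because the routine really returns its 16-bit sum in r0, a caller can
   declare a prototype with the true return type (in a header not included by
   this file) and invert the folded sum to form the final Internet checksum,
   the same way lwIP's inet_chksum() treats lwip_standard_chksum(). LWIP_CHKSUM
   is lwIP's standard hook for substituting a custom checksum routine, and the
   u16_t type comes from lwIP:

       u16_t thumb2_checksum(const void* pData, int length);
       #define LWIP_CHKSUM thumb2_checksum

       u16_t checksum = (u16_t)~LWIP_CHKSUM(packetData, packetLength);
*/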

#endif