;/*
;*
;* Copyright (C) 2010 Texas Instruments Incorporated - http://www.ti.com/ 
;* 
;* 
;*  Redistribution and use in source and binary forms, with or without 
;*  modification, are permitted provided that the following conditions 
;*  are met:
;*
;*    Redistributions of source code must retain the above copyright 
;*    notice, this list of conditions and the following disclaimer.
;*
;*    Redistributions in binary form must reproduce the above copyright
;*    notice, this list of conditions and the following disclaimer in the 
;*    documentation and/or other materials provided with the   
;*    distribution.
;*
;*    Neither the name of Texas Instruments Incorporated nor the names of
;*    its contributors may be used to endorse or promote products derived
;*    from this software without specific prior written permission.
;*
;*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
;*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
;*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
;*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
;*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
;*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
;*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
;*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*
;*/




;****************************************************************************
;;; TEXAS INSTRUMENTS, INC.
;****************************************************************************

;****************************************************************************
;;; Name
;;;     crcAccum = crc32_xortree(crcAccum, byteCnt, hwData)
;;; Inputs
;;;     1. Current 32-bit CRC value.
;;;     2. Count of new input data bytes.
;;;     3. Pointer to 16-bit aligned input data.
;;;
;;; Outputs
;;;     1. Updated 32-bit CRC value.
;;;
;;; Description
;;; This function computes the 32-bit AAL5 CRC or SSTED CRC for ATM cells.
;;; The cell contents are stored in a buffer aligned to a 16-bit boundary.
;;; The CRC is computed assuming the MS bit of each 16-bit word is
;;; transmitted first. An odd byte at the end will be in the MS byte of the
;;; last word. The CRC32 computation uses the XOR tree method. Data is
;;; processed 32 bits at a time whenever possible. The function is
;;; interruptible.
;;; Performance: 14 + 15 * #32-bit words (or fragments thereof).
;;;
;****************************************************************************

;;; Build-time endian switch. A non-zero value selects the little-endian
;;; byte-extraction constants and the little-endian power-table layout
;;; (both are chosen with ".if LITTLE_ENDIAN" further down).
LITTLE_ENDIAN     .set  1

;;; Exported entry point.
;;; The label has to be defined in the same section as the instructions it
;;; names. The conditional block that follows is disabled and emits nothing,
;;; and the instructions that are actually assembled are placed in
;;; ".text:fast". Select that section here so the symbol sits directly in
;;; front of the code instead of in the assembler's default section, where it
;;; would not be guaranteed to be adjacent to the function body after linking.
    .sect ".text:fast"
    .global _crclib_crc32_calc_xortree
_crclib_crc32_calc_xortree:
              .if 0
; KELVIN ASSEMBLY
;;; NOTE: everything between the ".if 0" above and the matching ".endif" is
;;; excluded from assembly. It is kept only as a reference copy of the
;;; XOR-tree CRC32 routine; none of the .asg names or labels below exist in
;;; the assembled output.
;******************************************************************
;;; LOCAL VARIABLE OFFSETS AND REGISTER ASSIGNMENTS:
;******************************************************************

;;; Function entry register assignments

        .asg B3,  retAddr                         ; Return address register.
        .asg A4,  crcAccum                        ; CRC accumulator.
        .asg B4,  byteCnt                         ; Count of input bytes.
        .asg A6,  data                            ; Address of input bytes, can be circular.

;;; Function scratch register assignments

        .asg A1,  Xin                             ; Input 32-bit words
        .asg A2,  rs0                             ; CRC right shift result
        .asg A3,  rs1                             ; CRC right shift term
        .asg A5,  ls0                             ; CRC left shift result
        .asg A7,  tmp1_a                          ; Scratch register
        .asg A8,  tmp2_a                          ; Scratch register
        .asg A9,  tmp3_a                          ; Scratch register
        .asg A16, tmp4_a                          ; Scratch register
        .asg A17, const0                          ; 0 constant for packing.
        .asg A18, crcRes                          ; crc residue to XOR into final result
        .asg B0,  loopCnt                         ; B0 is the loop count condition register.
        .asg B2,  ls1                             ; CRC left shift term
        .asg B5,  tmp1_b                          ; Scratch register
        .asg B6,  tmp2_b                          ; scratch register


;******************************************************************
;;; FUNCTION SETUP:
;;; 1. Load first non-aligned word. Check for 32-bit boundary.
;;; 2. If not 32-bit aligned, right-shift crcAccum 16 bits and left
;;;    shift crcAccum 16 bits (crc residue) to XOR into 1st partial
;;;    result. Use just the lower input half-word (HW).
;;; 3. Compute a mask and right shift for the trailing fragment.
;;; 4. The loop count is the byte count converted to words. A
;;;    leading frag. adds one iteration. A trailing frag. does not
;;;    add an iteration. Formula: (byteCnt + (data & 3)) >> 2
;;; 5. Increment data pointer by 1 word and round down to word
;;;    boundary.
;******************************************************************

        ZERO    .S1  const0                       ; Form zero for packing.
||      AND     .L1  3, data, A0                  ; Form byte address mod 4.
||      LDNW    .D1  *data++, Xin                 ; Get 32 bits of input data.

   [A0] SHRU    .S1  crcAccum, 16, crcAccum       ; If non-aligned, shift crcAccum right 16 bits.
|| [A0] PACK2   .L1  crcAccum, const0, crcRes     ; If non-aligned, save lower word as crc residue.
||[!A0] ZERO    .D1  crcRes                       ; If aligned CRC residue is 0.

        AND     .S1  -4, data, data               ; Round address pointer down to word boundary.
||      ADD     .L2X byteCnt, A0, B0              ; loopCnt = byteCnt + (data & 3)

        SHRU    .S2  loopCnt, 2, B0               ; loopCnt = (byteCnt + (data & 3)) >> 2
||      AND     .D2  B0, 3, B1                    ; Trailing # bytes is (byteCnt + data) & 3

        SUB     .L2  loopCnt, 1, B0               ; loopCnt -= 1.

  [!A0] SWAP2   .S1  Xin, Xin                     ; If aligned swap high and low HW.
|| [A0] PACK2   .L1  const0, Xin, Xin             ; If non-aligned use just low HW.

crc32_lastpass: 
        SHRU    .S1  Xin, 9, rs1                  ; Form part of rs0 >> 9.

;******************************************************************
;;; LOOP KERNEL:
;;; The loop kernel is 15 cycles.
;******************************************************************
crc32Loop:

;;; Compute the XOR of right shift terms:
;;; rs0 = Xin ^ crcAccum
;;; rs1 = (rs0 >> 9) ^ (rs0 >> 10)
;;; rs0 = rs0 ^ (rs0 >> 6) ^ (rs0 >> 12) ^ (rs0 >> 16) ^ (rs0 >> 24) ^
;;;       rs1 ^ (rs1 >> 16) ^ (rs1 >> 19) ^ (rs1 >> 21)

        SHRU    .S1  crcAccum, 9, tmp1_a          ; Form 2nd part of rs0 >> 9.
||      XOR     .L1  Xin, crcAccum, rs0           ; rs0 = Xin ^ crcAccum

        SHRU    .S1  rs0, 10, tmp1_a              ; rs0 >> 10
||      PACKH2  .L1  const0, rs0, tmp2_a          ; rs0 >> 16
||      XOR     .D1  tmp1_a, rs1, rs1             ; rs0 >> 9

        SHRU    .S1  rs0, 6, tmp1_a               ; rs0 >> 6
||      SHLMB   .L1  rs0, const0, tmp3_a          ; rs0 >> 24
||      XOR     .D1  rs1, tmp1_a, rs1             ; (rs0 >> 9) ^ (rs0 >> 10)
||      SHRU    .S2X rs0, 12, tmp1_b              ; rs0 >> 12
||      MV      .L2X rs0, tmp2_b                  ; rs0 to B side

;;; At this point rs0, rs1, rs0>>6, rs0>>12, rs0>>16 and rs0>>24 are
;;; available in rs0, rs1, tmp1_a, tmp1_b, tmp2_a and tmp3_a.

        SHRU    .S1  rs1, 19, tmp4_a              ; rs1 >> 19
||      PACKH2  .L1  const0, rs1, rs0             ; rs1 >> 16
||      XOR     .D1  rs1, tmp1_a, tmp1_a          ; rs1 ^ (rs0 >> 6)
||      XOR     .S2  tmp1_b, tmp2_b, tmp1_b       ; rs0 ^ (rs0 >> 12)
||      MV      .L2X tmp2_a, tmp2_b               ; rs0 >> 16 to B side

        XOR     .S1  rs0, tmp4_a, rs0             ; (rs1 >> 16) ^ (rs1 >> 19)
||      XOR     .L1  tmp1_a, tmp3_a, tmp1_a       ; rs1 ^ (rs0 >> 6) ^ (rs0 >> 24)
||      SHRU    .S2X rs1, 21, tmp2_b              ; rs1 >> 21
||      XOR     .L2  tmp1_b, tmp2_b, tmp1_b       ; rs0 ^ (rs0 >> 12) ^ (rs0 >> 16)

        XOR     .L1  rs0, tmp1_a, rs0             ; rs1 ^ (rs0 >> 6) ^ (rs0 >> 24) ^ (rs1 >> 16) ^ (rs1 >> 19)
||      XOR     .L2  tmp1_b, tmp2_b, tmp1_b       ; rs0 ^ (rs0 >> 12) ^ (rs0 >> 16) ^ (rs1 >> 21)

        NOP                                       ; Settle the cross path.

        XOR     .L1X rs0, tmp1_b, rs0             ; rs0 final result on A side.
||      XOR     .L2X tmp1_b, rs0, tmp1_b          ; rs0 final result on B side.


;;; Compute left shift terms
;;; ls0 = rs0 ^ (rs0 << 4)
;;; ls1 = (rs0 << 1) ^ (rs0 << 2) ^ (rs0 << 5)
;;; LS0 = (RS0 << 10) ^ LS0 ^ (LS0 << 12) ^ LS1 ^ (LS1 << 6) ^ (LS1 << 21) ^ crcRes

        SHL     .S1  rs0, 4, ls0                  ; rs0 << 4
|| [B0] LDW     .D1  *data++, Xin                 ; Get next input data word.
||      SHL     .S2  tmp1_b, 5, tmp2_b            ; rs0 << 5
||      ADD     .L2  tmp1_b, tmp1_b, tmp1_b       ; rs0 << 1

        SHL     .S1  rs0, 10, rs0                 ; rs0 << 10
||      XOR     .L1  rs0, ls0, ls0                ; ls0 = rs0 ^ (rs0 << 4)
|| [B0] B       .S2  crc32Loop                    ; Loop if more input data left
||      XOR     .L2  tmp1_b, tmp2_b, tmp1_b       ; (rs0 << 1) ^ (rs0 << 5)
||      ADD     .D2  tmp1_b, tmp1_b, ls1          ; rs0 << 2

        SHL     .S1  ls0, 12, ls0                 ; ls0 << 12
||      XOR     .L1  rs0, ls0, rs0                ; (rs0 << 10) ^ ls0
||      SUB     .L2  loopCnt, 1, loopCnt          ; Decrement loop count
||      XOR     .D2  tmp1_b, ls1, ls1             ; ls1 = (rs0 << 1) ^ (rs0 << 2) ^ (rs0 << 5)

        XOR     .L1  ls0, rs0, ls0                ; (rs0 << 10) ^ ls0 ^ (ls0 << 12)
||      SHL     .S2  ls1, 6, tmp1_b               ; ls1 << 6

        SHL     .S1X ls1, 21, tmp1_a              ; ls1 << 21
||      XOR     .L1  ls0, crcRes, ls0             ; (rs0<<10) ^ ls0 ^ (ls0<<12) ^ crcRes
||      XOR     .L2  ls1, tmp1_b, ls1             ; ls1 ^ (ls1 << 6)

        SWAP2   .S1  Xin, Xin                     ; Swap high and low HW.
||      XOR     .L1  ls0, tmp1_a, ls0             ; (rs0<<10) ^ ls0 ^ (ls0<<12) ^ (ls1<<21) ^ crcRes

        SHRU    .S1  Xin, 9, rs1                  ; Form part of rs0 >> 9.
||      XOR     .L1X ls0, ls1, crcAccum           ; Form final CRC value.
||      ZERO    .D1  crcRes                       ; crc Residue is 0 if not 1st pass.

;******************************************************************
;;; TRAILING BYTES and FUNCTION EXIT:
;;; 1. If trailing bytes, branch to top of loop. Otherwise, exit.
;;; 2. If trailing bytes, load last word.
;;; 3. If trailing bytes, adjust crcAccum with right shift.
;;; 4. crcRes is crcAccum left shifted.
;;; 5. Shift out unused bytes in last word.
;******************************************************************
;;; Condition reg B1 is # trailing bytes.
        SHL     .S1X B1, 3, tmp1_a                ; Left shift = trailing bytes * 8.
||      SUB     .L1X 4, B1, tmp2_a                ; right shift = (4 - trailing bytes) * 8.
|| [B1] LDW     .D1  *data, Xin                   ; Get data word with trailing bytes.

   [B1] B       .S1  crc32_lastpass               ; Go back to top if trailing bytes.
||[!B1] B       .S2  B3                           ; Exit if no trailing bytes.

        SHL     .S1  tmp2_a, 3, tmp2_a            ; right shift amount.

        SHL     .S1  crcAccum, tmp1_a, crcRes     ; crc residue is left shift of crcAccum.

   [B1] SHRU    .S1  crcAccum, tmp2_a, crcAccum   ; Adjust crcAccum with right shift.
||      ZERO    .L2  B1                           ; Zero trailing byte count.
||      ZERO    .D2  B0                           ; Zero loop counter.

        SWAP2   .S1  Xin, Xin                     ; Swap high and low HW of input word.

        SHRU    .S1  Xin, tmp2_a, Xin             ; Shift out unused bytes.

           .endif

;****************************************************************************
;;; Name
;;;     crcAccum = crc32_gmpy_ii2(dataPtr, crcAccum, hwCnt)
;;; Inputs
;;;     1. Pointer to input data.
;;;     2. Current 32-bit CRC value.
;;;     3. Count of new input data in halfwords.
;;;
;;; Outputs
;;;     1. Updated 32-bit CRC value.
;;;
;;; Description
;;; This function updates the 32-bit CCITT CRC, given an initial CRC value and
;;; an array of input halfwords. The main loop processes 4*L bytes at a time
;;; using the Galois field multiply instruction.
;;; A lookup table of "powers" of X in the Galois field is used. This table
;;; is 4*L words long. Main loop performance is 3 + L/2 cycles, so a longer
;;; table will better amortize those 3 extra cycles. Table length is a compile
;;; time choice.
;;; The CRC is computed assuming the MS bit of each byte is transmitted
;;; first. The function can be compiled for little endian or big endian. The
;;; endian dependencies are the byte extraction constants and the power table
;;; layout.
;;; The function is interruptible.
;;; Performance: For N 16-bit input words, N even
;;; 28 + N + 3 * ceil(N/2*L)
;;; One last odd 16-bit word is 3 extra cycles.
;;; Example1: L = 6. 48 input bytes takes 58 cycles
;;; Example2: L = 6. 256 input bytes takes 189 cycles
;****************************************************************************

;;; Constant definitions


;;; Constants to compute the number of passes and loop count per pass.
;;;   LOOPWORDS         32-bit input words consumed by one full pass of the
;;;                     pipelined loop. The power table must provide four
;;;                     entries per loop word (the setup code steps through
;;;                     it in 16-byte groups, starting at 16*LOOPWORDS-16).
;;;   INV_LOOPWORDS_X2  ceil(32768 / LOOPWORDS), i.e. 1/(2*LOOPWORDS) as a
;;;                     16-bit fraction. Multiplying a halfword count by it
;;;                     and keeping the upper 16 bits divides by 2*LOOPWORDS.
;;;   ROUNDUP           Not referenced by any instruction in this listing.
LOOPWORDS         .set  6
INV_LOOPWORDS_X2  .set  ((32767 + LOOPWORDS) / LOOPWORDS)
ROUNDUP           .set  (65537 * (LOOPWORDS - 1) / LOOPWORDS)

;;; Constants to extract bytes correctly in either endian case.
;;; Each value is (left shift << 5) | right shift, the operand format used
;;; by the register form of EXTU. The right shift is always 24, so exactly
;;; one byte is kept; the left shift selects which one (24 -> bits 7..0,
;;; 0 -> bits 31..24). EXTn_CONST picks the n-th byte of a loaded word in
;;; transmission order, which is why the table is mirrored between the two
;;; endian cases.
        .if  LITTLE_ENDIAN
EXT0_CONST        .set  (24 << 5) | 24
EXT1_CONST        .set  (16 << 5) | 24
EXT2_CONST        .set  ( 8 << 5) | 24
EXT3_CONST        .set  ( 0 << 5) | 24
        .else
EXT0_CONST        .set  ( 0 << 5) | 24
EXT1_CONST        .set  ( 8 << 5) | 24
EXT2_CONST        .set  (16 << 5) | 24
EXT3_CONST        .set  (24 << 5) | 24
        .endif

;;; Byte selectors for the CRC value itself. These do not depend on the
;;; endian setting because they are applied to a register value, not to
;;; data loaded from memory. EXT3_CRC_CONST selects bits 31..24; it is the
;;; only one of the four used by the instructions in this listing.
EXT0_CRC_CONST    .set  (24 << 5) | 24
EXT1_CRC_CONST    .set  (16 << 5) | 24
EXT2_CRC_CONST    .set  ( 8 << 5) | 24
EXT3_CRC_CONST    .set  ( 0 << 5) | 24



         .sect ".data"

;;; Table of X^(32 + 8*k) mod P for k = 0 .. 23, where P is the CRC-32
;;; polynomial (the first entry, X^32 mod P, is 0x04C11DB7).
;;; The table is organised in groups of four words (16 bytes), one group
;;; per 32-bit input word. Within a group the entries are not in ascending
;;; order: the first doubleword holds the pair that the loop loads into
;;; B-side registers and the second doubleword holds the pair loaded into
;;; A-side registers. The big-endian variant stores the two words of every
;;; doubleword in the opposite order.
;;; The number of groups must equal LOOPWORDS.

;;; Align this table on a cache-line boundary.
        .align 128
crc32_pwrTbl:
        .if  LITTLE_ENDIAN
        .word 0x04C11DB7                          ; X^32 mod P
        .word 0x01D8AC87                          ; X^48 mod P
        .word 0xD219C1DC                          ; X^40 mod P
        .word 0xDC6D9AB7                          ; X^56 mod P
        .word 0x490D678D                          ; X^64 mod P
        .word 0x4F576811                          ; X^80 mod P
        .word 0x1B280D78                          ; X^72 mod P
        .word 0x5BA1DCCA                          ; X^88 mod P
        .word 0xF200AA66                          ; X^96 mod P
        .word 0xF9AC87EE                          ; X^112 mod P
        .word 0x8090A067                          ; X^104 mod P
        .word 0x07F6E306                          ; X^120 mod P
        .word 0xE8A45605                          ; X^128 mod P
        .word 0xDD0FE172                          ; X^144 mod P
        .word 0x47F7CEC1                          ; X^136 mod P
        .word 0x2FB7BF3A                          ; X^152 mod P
        .word 0x17D3315D                          ; X^160 mod P
        .word 0x0A1B8859                          ; X^176 mod P
        .word 0x8167D675                          ; X^168 mod P
        .word 0x34028FD6                          ; X^184 mod P
        .word 0xC5B9CD4C                          ; X^192 mod P
        .word 0x064C29D0                          ; X^208 mod P
        .word 0xF382B7F2                          ; X^200 mod P
        .word 0x56AF9DB2                          ; X^216 mod P
        .else
        .word 0x01D8AC87                          ; X^48 mod P
        .word 0x04C11DB7                          ; X^32 mod P
        .word 0xDC6D9AB7                          ; X^56 mod P
        .word 0xD219C1DC                          ; X^40 mod P
        .word 0x4F576811                          ; X^80 mod P
        .word 0x490D678D                          ; X^64 mod P
        .word 0x5BA1DCCA                          ; X^88 mod P
        .word 0x1B280D78                          ; X^72 mod P
        .word 0xF9AC87EE                          ; X^112 mod P
        .word 0xF200AA66                          ; X^96 mod P
        .word 0x07F6E306                          ; X^120 mod P
        .word 0x8090A067                          ; X^104 mod P
        .word 0xDD0FE172                          ; X^144 mod P
        .word 0xE8A45605                          ; X^128 mod P
        .word 0x2FB7BF3A                          ; X^152 mod P
        .word 0x47F7CEC1                          ; X^136 mod P
        .word 0x0A1B8859                          ; X^176 mod P
        .word 0x17D3315D                          ; X^160 mod P
        .word 0x34028FD6                          ; X^184 mod P
        .word 0x8167D675                          ; X^168 mod P
        .word 0x064C29D0                          ; X^208 mod P
        .word 0xC5B9CD4C                          ; X^192 mod P
        .word 0x56AF9DB2                          ; X^216 mod P
        .word 0xF382B7F2                          ; X^200 mod P
        .endif        


    .sect ".text:fast"

;******************************************************************
;;; LOCAL VARIABLE OFFSETS AND REGISTER ASSIGNMENTS:
;;; Several registers carry more than one symbolic name. The names
;;; sharing a register are used in different phases of the routine;
;;; keep that in mind before reordering any instruction.
;******************************************************************

;;; Function entry register assignments

        .asg B3,  retAddr                         ; Return address register.
        .asg A4,  dataPtr                         ; Address of input data (read with non-aligned loads).
        .asg B4,  crcAccum_b                      ; CRC accumulator.
        .asg A6,  hwCnt                           ; Count of input halfwords.

;;; A-side working registers

        .asg A1,  thw                             ; Trailing half word indicator
        .asg A2,  p1st                            ; 1st outer loop pass detector
        .asg A4,  crcOut                          ; Return value; same register as dataPtr
        .asg A5,  pwrPtr_a                        ; A side power table pointer
        .asg A6,  crcAccum_a                      ; A-side copy of the CRC; same register as hwCnt
        .asg A7,  w0123                           ; Current 32-bit input word
        .asg A8,  term0_a                         ; GMPY result for input byte 0
        .asg A9,  term2_a                         ; GMPY result for input byte 2
        .asg A16, b0in_a                          ; Input byte 0
        .asg A17, b2in_a                          ; Input byte 2
        .asg A18, crc0_a                          ; CRC byte / carried-forward partial CRC
        .asg A19, crc2_a                          ; CRC byte / carried-forward partial CRC
        .asg A20, incPwr2_a                       ; Powers that carry the CRC across one pass
        .asg A21, incPwr0_a
        .asg A22, pwr2_a                          ; Power for input byte 2
        .asg A23, pwr0_a                          ; Power for input byte 0
        .asg A24, crc0_14                         ; A-side running XOR of loop terms
        .asg A25, crcUpd_a
        .asg A26, pwrPtrReset                     ; Power table pointer value restored for each pass
        .asg A27, lastByteInx                     ; Byte index used to fetch the trailing halfword
        .asg A27, lastByte0                       ; First trailing byte; same register as lastByteInx
        .asg A28, X40_a                           ; X^40 mod P
        .asg A29, X56_a                           ; X^56 mod P
        .asg A30, eByte0                          ; EXTU selector for input byte 0
        .asg A31, eByte2                          ; EXTU selector for input byte 2

;;; B-side working registers

        .asg B0,  wrdCnt                          ; Number of full input words
        .asg B1,  w1p                             ; Words to process in first pass
        .asg B2,  olc                             ; Outer loop count for SPLOOP reload
        .asg B5,  pwrPtr_b                        ; B Side power table pointer
;        .asg B6,
        .asg B7,  loopWords                       ; Number of input words to process in one pass.
        .asg B8,  term1_b                         ; GMPY result for input byte 1
        .asg B8,  invLoopWords_x2                 ; 1 / (2 * loopWords) to compute outer loop count
        .asg B9,  term3_b                         ; GMPY result for input byte 3
        .asg B16, hwCnt_m2                        ; hwCnt - 2
        .asg B16, roundWords                      ; wrdCnt-1 rounded down to multiple of loop words
        .asg B16, b1in_b                          ; Input byte 1
        .asg B17, pwrInx1p                        ; Power table index for the first pass
        .asg B17, b3in_b                          ; Input byte 3
        .asg B18, olc_m1                          ; Outer loop count - 1 computed from division
        .asg B18, crc1_b                          ; CRC byte / carried-forward partial CRC
        .asg B19, crc3_b                          ; CRC byte / carried-forward partial CRC
        .asg B20, incPwr3_b                       ; Powers that carry the CRC across one pass
        .asg B21, incPwr1_b
        .asg B22, pwr3_b                          ; Power for input byte 3
        .asg B23, pwr1_b                          ; Power for input byte 1
        .asg B24, crc1_15                         ; B-side running XOR of loop terms
        .asg B25, crcUpd_b
        .asg B27, lastByte1                       ; Second (last) trailing byte
        .asg B28, X32_b                           ; X^32 mod P, also written to GPLYA/GPLYB
        .asg B29, X48_b                           ; X^48 mod P
        .asg B30, eByte1                          ; EXTU selector for input byte 1
        .asg B31, eByte3                          ; EXTU selector for input byte 3
        
;******************************************************************
;;; FUNCTION SETUP:
;;; 1. Set the polynomial for the GMPY instruction to the 32-bit
;;;    CRC polynomial.
;;; 2. Set up 1st pass of processing. If there are N words, then
;;;    the first pass processes mod(N,LOOPWORDS) words.
;;; 3. Set up all subsequent passes. Each subsequent pass
;;;    processes LOOPWORDS words.
;;;
;;; The first-pass length is computed as
;;;    w1p = wrdCnt - LOOPWORDS * floor((wrdCnt - 1) / LOOPWORDS)
;;; so it lies in 1..LOOPWORDS (a full pass when wrdCnt is an exact
;;; multiple). The division is done by multiplying hwCnt - 2 by
;;; INV_LOOPWORDS_X2 and keeping the upper 16 bits of the product.
;;; NOTE(review): INV_LOOPWORDS_X2 is a rounded-up reciprocal and MPYU
;;; only multiplies the low 16 bits of its operands, so the quotient
;;; is an approximation. By hand calculation it first comes out one
;;; too large at hwCnt = 8197 with LOOPWORDS = 6 - confirm the maximum
;;; buffer length callers are allowed to pass.
;;;
;;; If there is no full word (wrdCnt == 0) the routine branches
;;; straight to crc32_finish; the five execute packets after that
;;; branch still run as its delay slots.
;******************************************************************

;;; Set p1st, olc, ILC, RILC, pwrPtr_b, pwrPtrReset
;;; incPwr0-3, zero crc0_14 and crc1_15
        
        MVKL crc32_pwrTbl, pwrPtrReset
||      MVK  INV_LOOPWORDS_X2, invLoopWords_x2    ; (words per half-word)/(words per pass), Q16
||      ZERO crc0_14
||      SUB  hwCnt, 2, hwCnt_m2                   ; hwCount - 2 to get outer loop count - 1.
        
        MVKH crc32_pwrTbl, pwrPtrReset            ; Form address of table of X^(32+8*N)
||      SHR  hwCnt, 1, wrdCnt                     ; Form count of full words
||      AND  1, hwCnt, thw                        ; 1 if trailing half-word to process
||      MPYU invLoopWords_x2, hwCnt_m2, olc_m1    ; floor((2*wrdCnt-2) * (0.5 / loopWords)) = outer loop count - 1
||      ZERO crc1_15
        
        MVK  1, p1st                              ; Arm first pass detect condition Reg. with 1
||      MVK  LOOPWORDS, loopWords                 ; Loop words is # input words processed per pass
||      LDDW  *pwrPtrReset, X48_b:X32_b           ; Get X^32, the CRC polynomial, from power table

        ADDK 16*LOOPWORDS-16, pwrPtrReset         ; Start value for power table ptr (last 16-byte group)
||      PACKH2 crc1_15, olc_m1, olc_m1            ; 16 MSBs is outer loop count minus 1
||      MPYHL olc_m1, loopWords, roundWords       ; (wrdCnt - 1) rounded down to multiple of loop words
||      MV   pwrPtrReset, pwrPtr_b                ; Power table base address to B side (value before the ADDK)
||      LDDW *+pwrPtrReset[1], X56_a:X40_a        ; Get X^40 and X^56 from power table
||      ADD  hwCnt, hwCnt, lastByteInx            ; Byte count (2 * hwCnt): index one past the last byte.

 [!wrdCnt] B    crc32_finish                      ; Just process trailing half word if no full words
||      MVC  loopWords, RILC                      ; Set SPLOOP iteration count for normal pass
||      SUB  lastByteInx, 1, lastByteInx          ; Form index of last trailing byte

        MVK  EXT1_CONST, eByte1
||      MVK  EXT0_CONST, eByte0                   ; Get endian dependent byte extract constants
||      SUB  wrdCnt, roundWords, w1p              ; Words 1st pass is # words minus rounded words
||      MV   crcAccum_b, crcAccum_a               ; Overwrites hwCnt (same register); hwCnt is not read after this
||[thw] LDBU *+dataPtr[lastByteInx], lastByte1    ; Get last trailing byte

        MVC  w1p, ILC                             ; Set SPLOOP iteration count for 1st pass
||      ADD  1, olc_m1, olc                       ; Finish outer loop count computation.
||      SUB  w1p, 1, pwrInx1p                     ; Form power table index for 1st pass.
||      SUB  lastByteInx, 1, lastByteInx          ; Form index of 2nd to last byte
        
        MVK  EXT2_CONST, eByte2                   ; Get endian dependent byte extract constants
||      MVK  EXT3_CONST, eByte3
||      ADD  pwrInx1p, pwrInx1p, pwrInx1p         ; Adjust index by 2 doublewords per loop word
||[thw] LDBU *+dataPtr[lastByteInx], lastByte0    ; Get 2nd to last trailing byte (replaces the index in the same register)
        
        MVC X32_b, GPLYA                          ; Set GMPY polynomial for side A
||[wrdCnt] LDDW *++pwrPtr_b[pwrInx1p], incPwr1_b:incPwr3_b ; Get powers to bring CRC forward
       
        MVC X32_b, GPLYB                          ; Set GMPY polynomial for side B
||      ADD  8, pwrPtr_b, pwrPtr_a                ; Position A-side ptr 2 Words past B-side Ptr
||[wrdCnt] LDDW *+pwrPtr_b[1], incPwr0_a:incPwr2_a ; Get powers to bring CRC forward
       
;******************************************************************
;;; LOOP KERNEL:
;;; 1. Load the next word and the next 2 powers.
;;; 2. Extract 4 bytes from the word and GMPY by appropriate powers
;;;    of X.
;;; 3. XOR all GMPY results together to get new CRC contribution.
;;;
;;; The loop is a hardware software-pipelined loop (SPLOOP) with an
;;; iteration interval of 2 cycles; one iteration handles one 32-bit
;;; input word. The power table pointers step downward by one 16-byte
;;; group per iteration, so the first word of a pass is multiplied by
;;; the highest powers and the last word by X^32..X^56.
;;; The partial results accumulate in crc0_14 (A side) and crc1_15
;;; (B side); they are combined with the carried-forward CRC in the
;;; code that follows the loop.
;;; The instruction order and the NOP counts below define the pipeline
;;; stages - do not reorder.
;******************************************************************
        
 [olc]  SPLOOP 2
||      ADD  8, pwrPtr_b, pwrPtr_a                ; Position A-side ptr 2 Words past B-side Ptr
        
;;; Stage 0 --------------------------------------

        SPMASK
||^     SUB   olc, 1, olc                         ; Decrement outer loop count before 1st reload.
||      LDNW  *dataPtr++, w0123                   ; Non-aligned load of the next input word.

        LDDW *pwrPtr_a--[2], pwr0_a:pwr2_a        ; Get powers for the input bytes, then step
||      LDDW *pwrPtr_b--[2], pwr1_b:pwr3_b        ; both pointers back by one 16-byte group.

;;; Stages 1 & 2 ---------------------------------

        NOP 3

        EXTU w0123, eByte0, b0in_a                ; Extract bytes 0 and 1, byte 0 1st in time
||      EXTU w0123, eByte1, b1in_b
        
;;; Stage 3 --------------------------------------

        EXTU w0123, eByte2, b2in_a                ; Extract bytes 2 and 3
||      EXTU w0123, eByte3, b3in_b
||      GMPY pwr0_a, b0in_a, term0_a              ; Compute CRC contribution of bytes 0 and 1
||      GMPY pwr1_b, b1in_b, term1_b

        GMPY pwr2_a, b2in_a, term2_a              ; Compute CRC contribution of bytes 2 and 3
||      GMPY pwr3_b, b3in_b, term3_b

;;; Stage 4 --------------------------------------

        NOP 2

;;; Stage 5 --------------------------------------

        XOR .L1 term0_a, crc0_14, crc0_14         ; XOR in CRC contribution of bytes 0 and 1
||      XOR .L2 term1_b, crc1_15, crc1_15

        SPKERNEL 0,0
||      XOR .L1 term2_a, crc0_14, crc0_14         ; XOR in CRC contribution of bytes 2 and 3
||      XOR .L2 term3_b, crc1_15, crc1_15

;******************************************************************
;;; LOOP RELOAD SETUP:
;;; 1. Bring existing CRC forward by multiplying by X^(32+8*4*L)
;;; 2. XOR new CRC contribution with existing CRC brought
;;;    forward.
;;; 3. Reset powers table pointer for next pass.
;;;
;;; This code runs once per pass, after the words of that pass have
;;; been issued. The four bytes of the CRC so far are multiplied by
;;; the incPwr* powers and XORed with the loop accumulators crc0_14
;;; and crc1_15 to give the new crcAccum_b.
;;; p1st is 1 on the first trip, so the two predicated LDDWs are
;;; skipped and the incPwr* values loaded during function setup (for
;;; the shorter first pass) are used. p1st is 0 on the second trip, so
;;; the powers for a full pass are loaded; on later trips p1st is
;;; non-zero again and the registers keep those full-pass values.
;;; NOTE(review): the schedule assumes the reloaded SPLOOP overlaps
;;; this code; instruction order and NOP counts are load-bearing.
;******************************************************************
        
outerLoop:
        
        ADD  8, pwrPtrReset, pwrPtr_a             ; Point back to highest power used
||      MV   pwrPtrReset, pwrPtr_b

  [!p1st] LDDW *pwrPtr_a, incPwr0_a:incPwr2_a     ; Get powers to bring CRC forward
||[!p1st] LDDW *pwrPtr_b, incPwr1_b:incPwr3_b     ; Get powers to bring CRC forward
        
        SPMASKR                                   ; Start reloading SPLOOP for processing of next LOOPWORDS
||      SUB p1st, 1, p1st                         ; Update 1st pass detect condition
||      MV   crcAccum_b, crcAccum_a

        NOP 2

        EXTU crcAccum_a, 0, 24, crc0_a           ; Extract intermediate CRC bytes
||      EXTU crcAccum_b, 8, 24, crc1_b

        EXTU crcAccum_a, 16, 24, crc2_a
||      EXTU crcAccum_b, 24, 24, crc3_b
||      GMPY incPwr0_a, crc0_a, crc0_a            ; Bring intermediate CRC forward by X^(32 * LOOPWORDS)
||      GMPY incPwr1_b, crc1_b, crc1_b

        GMPY incPwr2_a, crc2_a, crc2_a
||      GMPY incPwr3_b, crc3_b, crc3_b

 [olc]  B    outerLoop                            ; Stop post-SPLOOP program fetch after delay slots

 [olc]  SUB  olc, 1, olc                          ; Update outer loop count

        XOR crc0_14, crc0_a, crc0_a               ; Combine loop accumulators with the
||      XOR crc1_15, crc1_b, crc1_b               ; carried-forward CRC bytes.
        
        XOR crc2_a, crc0_a, crc0_a
||      XOR crc3_b, crc1_b, crcAccum_b
        
        NOP                

        SPMASK L1, L2
||      MV   term0_a, crc0_14                     ; Overwrite 1st XOR contribution with a move to
||      MV   term1_b, crc1_15                     ; initialize the crc0_14 and crc1_15 locations.
||      XOR  crcAccum_b, crc0_a, crcAccum_b       ; Update CRC with intermediate CRC contribution

;******************************************************************
;;; TRAILING Halfword and FUNCTION EXIT:
;;; 1. If odd halfword at the end, process it.
;;;
;;; The result is returned in crcOut.
;;; Without a trailing halfword (thw == 0) the first branch returns
;;; and crcOut is simply the accumulated CRC.
;;; With a trailing halfword the CRC is advanced by 16 bits:
;;;    crcOut = (crc << 16)
;;;             ^ (crc[31:24] ^ lastByte0) * X^40 mod P
;;;             ^ (crc[23:16] ^ lastByte1) * X^32 mod P
;;; Both branches target B3 and are predicated on opposite values of
;;; thw, so only one of them is taken.
;;; NOTE(review): the instructions after each branch rely on its delay
;;; slots and on the NOP count of the BNOP - do not reorder.
;******************************************************************
        
crc32_finish:
 [!thw]  B   B3                                   ; Return if no trailing half-word
||      MVK  EXT3_CRC_CONST, eByte2               ; Constant to get byte 3 of CRC.

        EXTU crcAccum_b, eByte2, crc2_a           ; Extract CRC bytes 2 and 3 (byte 3 must be A side).
||      EXTU crcAccum_b, 8, 24, crc3_b
||      MV   crcAccum_b, crcOut                   ; Move updated CRC to return value register.

        XOR  crc2_a, lastByte0, crc2_a            ; XOR with trailing bytes 0 and 1
||      XOR  crc3_b, lastByte1, crc3_b
        
        GMPY X40_a, crc2_a, crc2_a                ; Compute CRC contribution of the 2 bytes
||      GMPY X32_b, crc3_b, crc3_b
||[thw] BNOP B3, 2

        SHL  crcOut, 16, crcOut                   ; Position CRC bytes 0 and 1 in upper 16 bits.

        XOR  crc2_a, crcOut, crcOut               ; XOR in trailing byte 0 contribution (X^40 term)

        XOR  crcOut, crc3_b, crcOut               ; XOR in trailing byte 1 contribution (X^32 term)

           
;;; Nothing past this point
