kernel_optimize_test/arch/c6x/lib/csum_64plus.S

; SPDX-License-Identifier: GPL-2.0-only
;
;  linux/arch/c6x/lib/csum_64plus.s
;
;  Port on Texas Instruments TMS320C6x architecture
;
;  Copyright (C) 2006, 2009, 2010, 2011 Texas Instruments Incorporated
;  Author: Aurelien Jacquiot (aurelien.jacquiot@jaluna.com)
;
#include <linux/linkage.h>

;
;unsigned int csum_partial_copy(const char *src, char * dst,
;				int len, int sum)
;
; A4:	src
; B4:	dst
; A6:	len
; B6:	sum
; return csum in A4
;

	.text
ENTRY(csum_partial_copy)
	;; Copy len bytes from src to dst and accumulate a checksum of the
	;; copied data as a sum of 16-bit quantities.  The accumulated sum
	;; is folded to 16 bits, the given sum is added to it and the
	;; result is returned in A4.
	;; ILC is saved in B30 here and restored on the way out.
	MVC	.S2	ILC,B30

	MV	.D1X	B6,A31		; keep the given csum for the final add
	ZERO	.D1	A9		; csum accumulator (a side)
||	ZERO	.D2	B9		; csum accumulator (b side)
||	SHRU	.S2X	A6,2,B5		; B5 = len / 4 (number of whole words)

	;; Check alignment and size
	AND	.S1	3,A4,A1		; A1 = src & 3
||	AND	.S2	3,B4,B0		; B0 = dst & 3
	;; In the next packet the OR consumes A1 (src & 3) while the
	;; parallel MV gives A1 its new meaning (word count).
	OR	.L2X	B0,A1,B0	; B0 != 0 if src or dst is not word aligned
||	MVC	.S2	B5,ILC		; loop count = number of whole words
||	MVK	.D2	1,B2		; B2 = 1, multiplier for the MPYU below
||	MV	.D1X	B5,A1		; A1 != 0 if there is at least one word
  [!A1]	B	.S1	L8		; no whole word: only the tail is handled
   [B0] BNOP	.S1	L6,5		; not aligned: use the LDNW/STNW loop

	SPLOOP		1

	;; Main loop for aligned words
	;; Each word is loaded, stored to dst, and its two half-words are
	;; accumulated separately: upper half in A9, lower half in B9.
	LDW	.D1T1	*A4++,A7	; A7 = next source word
	NOP	4
	MV	.S2X	A7,B7		; b side copy of the word for the store
||	EXTU	.S1	A7,0,16,A16	; A16 = upper 16 bits of the word
	STW	.D2T2	B7,*B4++	; copy the word to dst
||	MPYU	.M2	B7,B2,B8	; B8 = lower 16 bits of the word (times 1)
||	ADD	.L1	A16,A9,A9	; a side csum += upper half
	NOP
	SPKERNEL	8,0
||	ADD	.L2	B8,B9,B9	; b side csum += lower half

	ZERO	.D1	A1		; A1 = 0 so that the test at L6 branches to L8
||	ADD	.L1X	A9,B9,A9	; merge the a side and b side sums into A9

L6:
  [!A1]	BNOP	.S1	L8,5		; no word left for the non-aligned loop

	;; Main loop for non-aligned words
	;; Same work as the aligned loop, using LDNW/STNW; both half-words
	;; of each word are accumulated into A9.
	SPLOOP		2
 ||	MVK	.L1	1,A2		; A2 = 1, multiplier for the MPYU below

	LDNW	.D1T1	*A4++,A7	; A7 = next source word (non-aligned load)
	NOP		3

	NOP
	MV	.S2X	A7,B7		; b side copy of the word for the store
 ||	EXTU	.S1	A7,0,16,A16	; A16 = upper 16 bits of the word
 ||	MPYU	.M1	A7,A2,A8	; A8 = lower 16 bits of the word (times 1)

	ADD	.L1	A16,A9,A9	; csum += upper half
	SPKERNEL	6,0
 ||	STNW	.D2T2	B7,*B4++	; copy the word to dst (non-aligned store)
 ||	ADD	.L1	A8,A9,A9	; csum += lower half

L8:	AND	.S2X	2,A6,B5		; B5 = len & 2
	CMPGT	.L2	B5,0,B0		; B0 = 1 if a half-word remains
  [!B0]	BNOP	.S1	L82,4		; no half-word: go check for a last byte

	;; Manage half-word
	;; The two bytes are copied one at a time; which of them is added
	;; shifted left by 8 depends on the configured endianness.
	ZERO	.L1	A7
||	ZERO	.D1	A8

#ifdef CONFIG_CPU_BIG_ENDIAN

	LDBU	.D1T1	*A4++,A7	; A7 = first byte
	LDBU	.D1T1	*A4++,A8	; A8 = second byte
	NOP		3
	SHL	.S1	A7,8,A0		; A0 = first byte << 8
	ADD	.S1	A8,A9,A9	; csum += second byte
	STB	.D2T1	A7,*B4++	; copy first byte
||	ADD	.S1	A0,A9,A9	; csum += first byte << 8
	STB	.D2T1	A8,*B4++	; copy second byte

#else

	LDBU	.D1T1	*A4++,A7	; A7 = first byte
	LDBU	.D1T1	*A4++,A8	; A8 = second byte
	NOP		3
	ADD	.S1	A7,A9,A9	; csum += first byte
	SHL	.S1	A8,8,A0		; A0 = second byte << 8

	STB	.D2T1	A7,*B4++	; copy first byte
||	ADD	.S1	A0,A9,A9	; csum += second byte << 8
	STB	.D2T1	A8,*B4++	; copy second byte

#endif

	;; Handle the last byte, if any
L82:	AND	.S2X	1,A6,B0		; B0 = len & 1
  [!B0]	BNOP	.S1	L9,5		; even length: nothing left, go fold

||	ZERO	.L1	A7

L83:	LDBU	.D1T1	*A4++,A7	; A7 = last byte
	NOP		4

	MV	.L2X	A7,B7		; b side copy of the byte for the store

#ifdef CONFIG_CPU_BIG_ENDIAN

	STB	.D2T2	B7,*B4++	; copy the last byte
||	SHL	.S1	A7,8,A7		; A7 = last byte << 8
	ADD	.S1	A7,A9,A9	; csum += last byte << 8

#else

	STB	.D2T2	B7,*B4++	; copy the last byte
||	ADD	.S1	A7,A9,A9	; csum += last byte

#endif

	;; Fold the csum
L9:	SHRU	.S2X	A9,16,B0	; B0 = csum >> 16
  [!B0]	BNOP	.S1	L10,5		; nothing above bit 15: no folding needed

L91:	SHRU	.S2X	A9,16,B4	; B4 = csum >> 16 (B4 is only scratch from here)
||	EXTU	.S1	A9,16,16,A3	; A3 = csum & 0xffff
	ADD	.D1X	A3,B4,A9	; csum = low half + high half

	SHRU	.S1	A9,16,A0	; A0 = csum >> 16
   [A0]	BNOP	.S1	L91,5		; fold again while bits above 15 remain

	;; NOTE(review): the given csum is added with a plain 32-bit ADD;
	;; a carry out of bit 31 is not added back here -- confirm that
	;; this is what callers expect.
L10:	ADD	.D1	A31,A9,A9	; add the given csum to the folded sum
	MV	.D1	A9,A4		; return value

	BNOP	.S2	B3,4		; return to the caller
	MVC	.S2	B30,ILC		; restore the ILC value saved on entry
ENDPROC(csum_partial_copy)

;
;unsigned short
;ip_fast_csum(unsigned char *iph, unsigned int ihl)
;{
;	unsigned int checksum = 0;
;	unsigned short *tosum = (unsigned short *) iph;
;	int len;
;
;	len = ihl*4;
;
;	if (len <= 0)
;		return 0;
;
;	while(len) {
;		len -= 2;
;		checksum += *tosum++;
;	}
;	if (len & 1)
;		checksum += *(unsigned char*) tosum;
;
;	while(checksum >> 16)
;		checksum = (checksum & 0xffff) + (checksum >> 16);
;
;	return ~checksum;
;}
;
; A4:	iph
; B4:	ihl
; return checksum in A4
;
	.text

ENTRY(ip_fast_csum)
	;; Sum ihl * 4 bytes starting at iph as unsigned 16-bit half-words,
	;; fold the sum to 16 bits and return the low 16 bits of its
	;; complement in A4.  Returns 0 when ihl * 4 is not greater than 0.
	;; ILC is saved in B30 here and restored before returning.
	ZERO	.D1	A5		; A5 = checksum accumulator
 ||	MVC	.S2	ILC,B30		; save the loop counter register
	SHL	.S2	B4,2,B0		; B0 = len = ihl * 4
	CMPGT	.L2	B0,0,B1		; B1 = (len > 0)
  [!B1] BNOP	.S1	L15,4		; len <= 0: return
  [!B1]	ZERO	.D1	A3		; ... with a result of 0

	;; NOTE(review): B0 > 0 whenever this point is reached with B1 set,
	;; so this branch looks like it is never taken -- confirm.
  [!B0]	B	.S1	L12
	SHRU	.S2	B0,1,B0		; B0 = number of half-words
	MVC	.S2	B0,ILC		; loop count = number of half-words
	NOP	3

	SPLOOP	1
	LDHU	.D1T1	*A4++,A3	; A3 = next half-word (zero extended)
	NOP	3
	NOP
	SPKERNEL	5,0
 ||	ADD	.L1	A3,A5,A5	; checksum += half-word

L12:	SHRU	.S1	A5,16,A0	; A0 = checksum >> 16
  [!A0]	BNOP	.S1	L14,5		; nothing above bit 15: no folding needed

L13:	SHRU	.S2X	A5,16,B4	; B4 = checksum >> 16
	EXTU	.S1	A5,16,16,A3	; A3 = checksum & 0xffff
	ADD	.D1X	A3,B4,A5	; checksum = low half + high half
	SHRU	.S1	A5,16,A0	; A0 = checksum >> 16
  [A0]	BNOP	.S1	L13,5		; fold again while bits above 15 remain

L14:	NOT	.D1	A5,A3		; A3 = ~checksum
	EXTU	.S1	A3,16,16,A3	; keep only the low 16 bits

L15:	BNOP	.S2	B3,3		; return to the caller
	MVC	.S2	B30,ILC		; restore the ILC value saved on entry
	MV	.D1	A3,A4		; return value
ENDPROC(ip_fast_csum)

;
;unsigned short
;do_csum(unsigned char *buff, unsigned int len)
;{
;	int odd, count;
;	unsigned int result = 0;
;
;	if (len <= 0)
;		goto out;
;	odd = 1 & (unsigned long) buff;
;	if (odd) {
;#ifdef __LITTLE_ENDIAN
;		result += (*buff << 8);
;#else
;		result = *buff;
;#endif
;		len--;
;		buff++;
;	}
;	count = len >> 1;		/* nr of 16-bit words.. */
;	if (count) {
;		if (2 & (unsigned long) buff) {
;			result += *(unsigned short *) buff;
;			count--;
;			len -= 2;
;			buff += 2;
;		}
;		count >>= 1;		/* nr of 32-bit words.. */
;		if (count) {
;			unsigned int carry = 0;
;			do {
;				unsigned int w = *(unsigned int *) buff;
;				count--;
;				buff += 4;
;				result += carry;
;				result += w;
;				carry = (w > result);
;			} while (count);
;			result += carry;
;			result = (result & 0xffff) + (result >> 16);
;		}
;		if (len & 2) {
;			result += *(unsigned short *) buff;
;			buff += 2;
;		}
;	}
;	if (len & 1)
;#ifdef __LITTLE_ENDIAN
;		result += *buff;
;#else
;		result += (*buff << 8);
;#endif
;	result = (result & 0xffff) + (result >> 16);
;	/* add up carry.. */
;	result = (result & 0xffff) + (result >> 16);
;	if (odd)
;		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
;out:
;	return result;
;}
;
; A4:	buff
; B4:	len
; return checksum in A4
;

ENTRY(do_csum)
	   ; Sum the len bytes at buff as 16-bit quantities and return the
	   ; sum folded to 16 bits in A4, following the C reference given in
	   ; the comment above.  Returns 0 when len is not greater than 0.
	   ;
	   ; Register roles in this routine:
	   ;   A1 = running result	   B3 = remaining len
	   ;   A3 = odd start flag	   A5 = saved return address
	   ;   A0 = scratch data and carry  B0 = scratch count and flag
	   ;
	   ; The two packets following the first BNOP are positioned so that
	   ; A1 = 0 and A5 are set up on the early exit path as well.
	   CMPGT   .L2	   B4,0,B0	; B0 = (len > 0)
   [!B0]   BNOP    .S1	   L26,3	; len <= 0: go return the zeroed result
	   EXTU    .S1	   A4,31,31,A0	; A0 = odd = buff & 1

	   MV	   .L1	   A0,A3	; A3 = odd, kept for the final byte swap
||	   MV	   .S1X    B3,A5	; A5 = return address
||	   MV	   .L2	   B4,B3	; B3 = len
||	   ZERO    .D1	   A1		; result = 0

#ifdef CONFIG_CPU_BIG_ENDIAN
	   ; odd start: result = *buff++, len--
   [A0]    SUB	   .L2	   B3,1,B3
|| [A0]    LDBU    .D1T1   *A4++,A1
#else
	   ; odd start: result = *buff++ << 8, len--; even start skips to L21
   [!A0]   BNOP    .S1	   L21,5
|| [A0]    LDBU    .D1T1   *A4++,A0
	   SUB	   .L2	   B3,1,B3
||	   SHL	   .S1	   A0,8,A1
L21:
#endif
	   SHR	   .S2	   B3,1,B0	; B0 = count = len >> 1 (16-bit words)
   [!B0]   BNOP    .S1	   L24,3	; count == 0: only a last byte may remain
	   MVK	   .L1	   2,A0
	   AND	   .L1	   A4,A0,A0	; A0 = buff & 2

	   ; buff not 4-byte aligned: consume one half-word first
   [!A0]   BNOP    .S1	   L22,5
|| [A0]    LDHU    .D1T1   *A4++,A0
	   SUB	   .L2	   B0,1,B0	; count--
||	   SUB	   .S2	   B3,2,B3	; len -= 2
||	   ADD	   .L1	   A0,A1,A1	; result += half-word
L22:
	   SHR	   .S2	   B0,1,B0	; B0 = count >>= 1 (32-bit words)
||	   ZERO    .L1	   A0		; carry = 0

   [!B0]   BNOP    .S1	   L23,5	; no 32-bit word: skip the loop
|| [B0]    MVC	   .S2	   B0,ILC	; loop count = number of 32-bit words

	   ; 32-bit word loop: A1 = w (loaded word), A2 = result, A0 = carry.
	   ; NOTE(review): the SPMASK on L1 presumably makes the MV that
	   ; seeds A2 from A1 run only once -- confirm against the ISA guide.
	   SPLOOP  3
	   SPMASK  L1
||	   MV	   .L1	   A1,A2
||	   LDW	   .D1T1   *A4++,A1

	   NOP	   4
	   ADD	   .L1	   A0,A1,A0	; A0 = carry + w
	   ADD	   .L1	   A2,A0,A2	; result += carry + w

	   SPKERNEL 1,2
||	   CMPGTU  .L1	   A1,A2,A0	; carry = (w > result), unsigned

	   ADD	   .L1	   A0,A2,A6	; A6 = result + last carry
	   EXTU    .S1	   A6,16,16,A7	; A7 = A6 & 0xffff
	   SHRU    .S2X    A6,16,B0	; B0 = A6 >> 16
	   NOP		   1
	   ADD	   .L1X    A7,B0,A1	; result = low half + high half
L23:
	   MVK	   .L2	   2,B0
	   AND	   .L2	   B3,B0,B0	; B0 = len & 2
   [B0]    LDHU    .D1T1   *A4++,A0	; trailing half-word, if any
	   NOP	   4
   [B0]    ADD	   .L1	   A0,A1,A1	; result += half-word
L24:
	   EXTU    .S2	   B3,31,31,B0	; B0 = len & 1
#ifdef CONFIG_CPU_BIG_ENDIAN
	   ; last byte, if any: result += *buff << 8
   [!B0]   BNOP    .S1	   L25,4
|| [B0]    LDBU    .D1T1   *A4,A0
	   SHL	   .S1	   A0,8,A0
	   ADD	   .L1	   A0,A1,A1
L25:
#else
	   ; last byte, if any: result += *buff
   [B0]    LDBU    .D1T1   *A4,A0
	   NOP	   4
   [B0]    ADD	   .L1	   A0,A1,A1
#endif
	   ; fold the result to 16 bits, twice to absorb the carry
	   EXTU    .S1	   A1,16,16,A0	; A0 = result & 0xffff
	   SHRU    .S2X    A1,16,B0	; B0 = result >> 16
	   NOP	   1
	   ADD	   .L1X    A0,B0,A0	; A0 = low half + high half
	   SHRU    .S1	   A0,16,A1	; A1 = carry out of the first fold
	   ADD	   .L1	   A0,A1,A0	; add the carry back in
	   EXTU    .S1	   A0,16,16,A1	; result = A0 & 0xffff
	   EXTU    .S1	   A1,16,24,A2	; A2 = (result >> 8) & 0xff

	   EXTU    .S1	   A1,24,16,A0	; A0 = (result & 0xff) << 8
||	   MV	   .L2X    A3,B0	; B0 = odd

   [B0]    OR	   .L1	   A0,A2,A1	; odd start: swap the two result bytes
L26:
	   NOP	   1
	   BNOP    .S2X    A5,4		; return through the saved address
	   MV	   .L1	   A1,A4	; return value
ENDPROC(do_csum)

;__wsum csum_partial(const void *buff, int len, __wsum wsum)
;{
;	unsigned int sum = (__force unsigned int)wsum;
;	unsigned int result = do_csum(buff, len);
;
;	/* add in old sum, and carry.. */
;	result += sum;
;	if (sum > result)
;		result += 1;
;	return (__force __wsum)result;
;}
;
ENTRY(csum_partial)
	   ; Call do_csum on (buff, len) as received in A4 and B4, then add
	   ; the old sum (wsum, received in A6) to its result together with
	   ; the carry of that addition.  The result is returned in A4.
	   ; A8 and A9 hold values across the call; do_csum as written in
	   ; this file does not write either of them.
	   MV	   .L1X    B3,A9	; A9 = return address of our caller
||	   CALLP   .S2	   do_csum,B3	; A4 = do_csum(buff, len)
||	   MV	   .S1	   A6,A8	; A8 = sum (wsum)
	   BNOP    .S2X    A9,2		; return; the adds below precede the jump
	   ADD	   .L1	   A8,A4,A1	; A1 = result + sum
	   CMPGTU  .L1	   A8,A1,A0	; A0 = (sum > A1), i.e. the add wrapped
	   ADD	   .L1	   A1,A0,A4	; return value = A1 + carry
ENDPROC(csum_partial)

;unsigned short
;ip_compute_csum(unsigned char *buff, unsigned int len)
;
; A4:	buff
; B4:	len
; return checksum in A4

ENTRY(ip_compute_csum)
	   ; Call do_csum on (buff, len) as received in A4 and B4 and return
	   ; the complement of its result, limited to the low 16 bits, in A4.
	   ; A9 holds the return address across the call; do_csum as written
	   ; in this file does not write it.
	   MV	   .L1X    B3,A9	; A9 = return address of our caller
||	   CALLP   .S2	   do_csum,B3	; A4 = do_csum(buff, len)
	   BNOP    .S2X    A9,3		; return; the two ops below precede the jump
	   NOT	   .S1	   A4,A4	; A4 = ~csum
	   CLR     .S1	   A4,16,31,A4	; clear bits 16 to 31, keep the low 16 bits
ENDPROC(ip_compute_csum)