Nikolai Golovchenko says:

The debugged version (ed.: with a correction from Vladyslav Borodavka; thank you!):
```
;***********************************************
; Square 10 bits
;
; 6 Aug 2000 by Nikolai Golovchenko
; Based on the original version of John Payson
;
; Input:
;  SrcH:SrcL	(10-bit value; only bits 0 and 1 of SrcH are
;		 handled, and SrcH is also used as the Sqr4
;		 table index, so SrcH must be 0..3)
; Output:
;  DstH:DstM:DstL = Src^2
;  W = 0 on return (the routine ends with retlw 0)
;  SrcH:SrcL are not modified
;
; Instructions: 68 (51 in Sqr10 + 17 in Sqr4)
; Execution time(including return): 51+5*3+1=67
;
; Description:
;  The goal is calculation of Dst = Src^2
;
;  Src may be rewritten as:
;	Src = SrcH*256 + SrcL
;  Let's introduce variables a, ah, al, and b:
;	a = ah*16 + al = SrcL
;	b = SrcH
;  Then
;	Src^2 = (256*b+a)^2=65536*b^2+512*b*a+a^2
;
;  Sqr4 routine calculates squares for 4 bit data
;  (b, al, and ah) using look-up table.
;
;  Now we can find b^2 by Sqr4 and multiply it by 65536,
;  and calculate 512*b*a.
;
;  To calculate a^2 let's expand it in byte halves:
;  a^2=(16*ah+al)^2=256*ah^2+32*ah*al+al^2
;
;  So,
;	Src^2=65536*b^2+512*b*a+256*ah^2+32*ah*al+al^2
;
;  This is the algorithm of how to square a 10 bit number
;  using Sqr4 look-up table.
;
; Notes:
;  * Every conditional add relies on the instruction that
;    directly follows a btfsc/skpnc (shown with one extra
;    space of indent). Removing any of them silently breaks
;    the result.
;  * Sqr4 is a computed jump through PCL (the program counter
;    low SFR from the processor include file). The usual PIC
;    rules apply: PCLATH must select the table's page and the
;    table must not cross a 256-word boundary.
;
;***********************************************
Sqr10
	clrf DstH	;clear result
	clrf DstM
	clrf DstL
	clrc		;clear carry
;find 32*ah*al (shift-and-add, one step per bit of ah)
	movf SrcL, w	;w = SrcL
	andlw 0x0F	;w = al (stays in w until the first Sqr4 call)
	btfsc SrcL, 4
	 addwf DstM, f	;DstM = ah<0>*al (never overflows, carry = 0)
	rrf DstM, f	;use carry after addition
	rrf DstL, f	;bit shifted out of DstL is always 0 -> carry = 0
;DstM:DstL=ah<0>*al*256/2
	btfsc SrcL, 5
	 addwf DstM, f
	rrf DstM, f
	rrf DstL, f
;DstM:DstL=(ah<0>*al*256/2 + ah<1>*al*256)/2=64*al(ah<0>+2*ah<1>)
	btfsc SrcL, 6
	 addwf DstM, f
	rrf DstM, f
	rrf DstL, f
;DstM:DstL=32*al(ah<0>+2*ah<1>+4*ah<2>)
	btfsc SrcL, 7
	 addwf DstM, f	;last step: add only, no shift
;DstM:DstL=32*al(ah<0>+2*ah<1>+4*ah<2>+8*ah<3>)=32*ah*al
;(maximum value = 0x1C20)

;Now add squared al and ah
	call Sqr4	;w = al^2
	addwf DstL, f	;Dst += al^2
	skpnc
	 incf DstM, f	;propagate carry to DstM

	swapf SrcL, w
	andlw 0x0F	;w = ah
	call Sqr4	;w = ah^2
	addwf DstM, f	;Dst += 256*ah^2; carry is reset (a^2 <= 0xFE01)

;At this point DstM:DstL contains a^2

;512*b*a=512*(2*b<1>+b<0>)*(128*a<7>+a<0:6>)=
;=65536*b*a<7>+256*(2*a<0:6>*b<0>+2*a<0:6>*b<1>+2*a<0:6>*b<1>)

	movf SrcH, w	;Dst += 512*b*(128*a<7>) = 65536*b*a<7>
	btfsc SrcL, 7
	 addwf DstH, f	;carry is reset

	rlf SrcL, w	;w = a<0:6>*2 (a<7> is already used)
	clrc		;drop a<7> that rlf moved into carry
	btfsc SrcH, 0
	 addwf DstM, f	;Dst += 512*b<0>*a<0:6>
	skpnc
	 incf DstH, f

	clrc		;Dst += 512*b<1>*a<0:6>
	btfsc SrcH, 1
	 addwf DstM, f
	skpnc
	 incf DstH, f

	clrc		;Dst += 512*b<1>*a<0:6> (second time: 1024*b<1>*a<0:6> in total)
	btfsc SrcH, 1
	 addwf DstM, f
	skpnc
	 incf DstH, f

	movf SrcH, w
	call Sqr4	;w = b^2
	addwf DstH, f	;Dst += 65536*b^2
	retlw 0		; All done!

;Sqr4: w = w^2 for w = 0..15 (w > 15 jumps past the table)
Sqr4:			;Look-up table for 4 bit squares
	addwf PCL, f	;computed jump to the retlw entry number w
	DT 0,1,4,9,16,25,36,49,64,81,100,121,144,169,196,225

;***********************************************

```

The original version:

```	clrf DstH
; Historical listing, shown for comparison with the "debugged version"
; listing earlier on this page. The code is left exactly as published.
; NOTE(review): as listed, nothing in this routine adds W into
; DstL/DstM/DstH, so the listing looks incomplete (lines appear to have
; been lost) - TODO confirm against the original post before reusing it.
clrf DstM
clrf DstL
movf SrcL,w
andlw \$0F
; W = low nibble of SrcL. The "\$" looks like markup escaping of "$0F"
; (hex 0F) - confirm.
; Each btfss below only guards the single instruction that follows it.
; NOTE(review): "Src" is not defined in the visible listing; presumably
; SrcL is meant - confirm.
btfss Src,4
rrf DstM
rrf DstL
btfss Src,5
rrf DstM
rrf DstL
btfss Src,6
rrf DstM
rrf DstL
btfss Src,7
call Sqr4
; NOTE(review): no destination is given for swapf; if the assembler
; defaults to the file register, SrcL itself is modified here - confirm.
swapf SrcL
andlw \$0F
call Sqr4
; At this point, the 16-bit result is meant to be in DstM:DstL
; (the published comment said DstM:DstH).
; 25 words of code prior to this point (plus a
; 17-word table-lookup). Total execution time:
; 35 cycles up to this point.
; (Word and cycle counts are the author's figures; they do not match
; the number of instructions visible in this listing.)
btfss SrcH,0
goto NoBit8
movf SrcL,w
; "C" presumably names the carry flag (a define not shown here).
btfsc C
incf DstH
btfsc C
incf DstH
incf DstH
; Another 9 words for bit 8; 3 or 9 cycles to exec.
NoBit8:
btfss SrcH,1
goto NoBit9
; The value loaded by movlw 4 / movlw 8 is overwritten by the
; "rlf SrcL,w" below without being used in the visible code.
movlw 4
btfss SrcH,0
movlw 8
rlf SrcL,w
btfsc C
incf DstH
btfsc C
incf DstH
btfsc C
incf DstH
btfsc C
incf DstH
; Another 17 words for bit 9; 3 or 17 cycles to execute
; Total worst-case time: 35+26 = 61 cycles.
NoBit9:
retlw 0 ; All done!
; Table of squares 0^2..15^2.
; NOTE(review): there is no visible jump into the table (the debugged
; version's header counts Sqr4 as 17 words, one more than the 16
; entries) - confirm how "call Sqr4" is meant to index it.
Sqr4:
db 0,1,4,9,16,25,36,49,64,81,100,121,144,169,196,225
```

Questions:

• Hi guys, I'm not sure if I'm the one doing something wrong here, but the method for squaring a 10-bit number that is displayed on this page doesn't seem to work properly. The following function call:

movlw 0x03	; high byte of the test input 0x3FF
movwf SrcH
movlw 0xFF	; low byte of the test input 0x3FF
movwf SrcL
call Sqr10	; square is returned in DstH:DstM:DstL

gives me 0xF3B45 as the square of 0x3FF, which is obviously wrong (it should be 0xFF801). Any help or comments?

