TITLE FSCOPY - Fast String Copy
       SUBTTL Courtesy of KLH
       SEARCH MACSYM

;;;AC's

A=:1                            ;Source BP
B=:2                            ;Temp
C=:3                            ;Byte count
D=:4                            ;Local AC's follow
E=:5
T=:6
U=:7
V=:10
W=:11
L=:12
M=:13
N=:14
O=:15                           ;Destination BP

KLWINC==^D18                    ;# chars at which hairy word move starts
                               ; winning over BP loop

$STENT==1                       ;offset from beg of loop for entry to
                               ; STORE phase
$GENT==4                        ;offset from beg of loop for entry to GET phase

       .PSECT DATA

FSCPKL: 0                       ;a LSHC A,<n> for SHIFT-IN goes here
       LSH A,1
       MOVEM A,(C)             ;Address of dest stored in RH here
       0                       ;a LSHC A,<n> for SHIFT-OUT goes here
       MOVE B,(C)              ;Address of source stored in RH here
       AOBJN C,FSCPKL
       JRST @FENTRM(D)

       .ENDPS
       .PSECT CODE

FSCOPY::CAIL C,KLWINC           ;Less than break-even point?
       IFSKP.
         DO.                   ;Yes, can't win
           ILDB B,A            ;Simple byte-by-byte copying.
           IDPB B,O
           SOJG C,TOP.
         ENDDO.
         RET
       ENDIF.

;;;Wheee, using hairy word copying!

       STKVAR <<TMPACS,20>>
       HRLI B,D                ;Save AC's D-O
       HRRI B,D+TMPACS
       BLT B,O+TMPACS

       LDB E,[POINT 3,A,5]     ;Get low 3 bits of P field for source
       SKIPGE E,FSCHTB(E)      ;Get resulting # chars, skip if addr ok
        MOVEI A,1(A)           ;P= 01, must bump address.
       MOVEI L,1(A)            ;Anyway, get addr+1 into 12.
       LDB D,[POINT 3,O,5]     ;Repeat procedure for dest
       SKIPGE D,FSCHT2(D)      ;Using slightly different table
        AOSA V,O               ;And addr goes into 10
         MOVEI V,(O)           ;And isn't normally bumped.
       MOVEI O,(C)             ;Update the destination pointer in o
       ADJBP O,O+TMPACS        ;From initial value
;Now get index for shift values, and count for words
       SUBI C,(E)              ;Get # chars minus those in 1st src wd.
       ADDI E,-6(D)            ;Get E index - D*5+S, zero based.
       IDIVI C,5               ;Find # words to loop through, rem in d
       MOVE B,-1(L)            ;And get 1st word of source.
       JRST @FPATH(E)          ;Must now pick a path...

;Blt possible!  Jump to FSBLT0 if no shifting needed for setup.

FSBLT0: MOVEM B,(V)             ;Store source word directly
       JRST FSBLT4
FSBLT:  LSH B,@SHASL(E)         ;Shift source up against left
       MOVE A,(V)              ;Get 1st wd of dest.
       LSH A,@SHADR(E)         ;Right-adjust it
       LSHC A,@SHFIX(E)        ;And get everything into A.
       LSH A,1                 ;Need one more bit's worth.
       MOVEM A,(V)             ;Store 1st wd of dest...

;Now settle down to serious BLT'ing.

FSBLT4: MOVEI T,(C)             ;Transfer word count
       ADDI T,(V)              ;Find addr of last dest word
       MOVEI V,1(V)            ;Now get 1st dest addr,
       HRLI V,(L)              ;And put 1st source addr in LH.
       BLT V,(T)               ;Zoom!!
       JUMPE D,FSCPY9          ;If no remainder, super win - done!
       ADDI L,(C)              ;Hmm, must get last source word.
       MOVE B,(L)              ;Like so.
       MOVE A,FBMSK(D)         ;And a word mask for chars
       AND B,A                 ;Clear unused bits from source,
       ANDCAM A,1(T)           ;And zap target bits in dest.
       IORM B,1(T)             ;And stick last chars in.
       JRST FSCPY9             ;Ok, all done...

;Can't do BLT.  Well, get A and B set up for magical shift loop.

SHSKP2: LSH B,@SHASL(E)         ;Here, only need to adjust source,
       JRST SHSKP5             ;Since dest will be totally clobbered.
FSSHFT: LSH B,@SHASL(E)         ;Both src and dest must be integrated.
SHSKP1: MOVE A,(V)              ;Only need adjust dest; src wd is full.
       LSH A,@SHADR(E)
SHSKP5: LSHC A,@SHFIX(E)        ;Stuff as many chars as possible into A
       CAIE D,0                ;If any remainder,
        MOVEI C,1(C)           ;Add 1 more word.
       MOVNI C,(C)             ;Make AOBJN pointer.
       MOVSI C,(C)

;Now make another index for termination wrapup purposes.

       ADD D,FFINDX(E)         ;New idx using # chs left in last wd.

;Now set things up for loop, and enter it.

FSCPY3: HRLI V,(<MOVEM A,(C)>)
       MOVEM V,FSCPKL+2        ;Address for MOVEM
       HRRM L,FSCPKL+4         ;Address for MOVE
       MOVE L,FSHINT(E)        ;Get LSH for shift-in
       MOVEM L,FSCPKL
       MOVE L,FSHOUT(E)        ;And shift-out
       MOVEM L,FSCPKL+3
       JUMPGE D,FSCPKL+$STENT  ;Depending on flag, enter loop at store
       SOS V,FSCPKL+2
       JRST FSCPKL+$GENT       ;Or at get.

;---------------------------------------------------------------------------
;Come here when loop finished.  The last word of the source string
;will be in B.  It may have 1 to 5 chars left for moving, but will
;never have 0.

;Long wrapup.

FSCPTL: LSHC A,@FSCPKL          ;Perform a shift-in
       LSH A,1
       MOVEM A,@10             ;Store full word.
       MOVEI C,1(C)            ;Increment address index
;And drop through to Medium wrapup.

;Medium wrapup.

FSCPTM: LSHC A,@FLOUT(D)        ;Shift rest of source word into A
       MOVE B,@10              ;Get dest word it will be stored into
       LSH B,@FLADJ(D)         ;Left-adjust chars to preserve.
                               ;And drop thru to Short wrapup.

;Short wrapup.

FSCPTS: LSHC A,@FFLOUT(D)       ;Do final, last, shift-out.
       ANDCMI A,1
       MOVEM A,@10             ;And store last dest word.

;Done!!  Just restore regs and return.

FSCPY9: HRLI N,D+TMPACS         ;Restore regs
       HRRI N,D
       BLT N,N
       RET

;Indexed by low 3 bits of P field, returns # chars
;existing to right of loc BP points to.  Hence value
;ranges from 5 to 1; if P = 01, SETZ indicates that
;bp address needs incrementing.

FSCHTB: 1                       ;P=10
       SETZ 5                  ;P=01, increment addr
       0
       0                       ;Randomness
       5                       ;P=44, full word
       4                       ;P=35, 4 chars to go
       3                       ;P=26
       2                       ;P=17

;This table is just like FSCHTB except values are pre-multiplied
;by 5 for easy addition into E.

FSCHT2: 1*5                     ;P=10
       SETZ 5*5                ;P=01, increment addr
       0
       0                       ;Random
       5*5
       4*5
       3*5
       2*5

;This table is indexed by D when it has # chars remaining from
;dividing # chars (in C) by 5.  Provides mask for these chars.

FBMSK:  0                       ;Nothing here.
       BYTE (7) 177
       BYTE (7) 177, 177
       BYTE (7) 177, 177, 177
       BYTE (7) 177, 177, 177, 177

;FPATH table vectors off to BLT and other minor stuff as
;soon as all the basic computations are made.
;Indexed by E.

FPATH:  FSBLT
       FSSHFT
       FSSHFT
       FSSHFT
       SHSKP1
       FSSHFT
       FSBLT
       FSSHFT
       FSSHFT
       SHSKP1
       FSSHFT
       FSSHFT
       FSBLT
       FSSHFT
       SHSKP1
       FSSHFT
       FSSHFT
       FSSHFT
       FSBLT
       SHSKP1
       SHSKP2
       SHSKP2
       SHSKP2
       SHSKP2
       FSBLT0

DEFINE ENT (A,B,C,D,E) <
       A*7
       B*7
       C*7
       D*7
       E*7
>;DEFINE ENT

;SHASL table, contains # bits to shift first source wd left so
;as to left-adjust it in B.  Indexed by E.

SHASL:  ENT 4,3,2,1,0
       ENT 4,3,2,1,0
       ENT 4,3,2,1,0
       ENT 4,3,2,1,0
       ENT 4,3,2,1,0

;SHADR table, contains # bits to shift first dest wd right so
;as to right-adjust it in A.  Indexed by E.

DEFINE ENT1 (A,B,C,D,E) <
       0,,A*7-1
       0,,B*7-1
       0,,C*7-1
       0,,D*7-1
       0,,E*7-1
>;DEFINE ENT1

SHADR:  ENT1 -1,-1,-1,-1,-1
       ENT1 -2,-2,-2,-2,-2
       ENT1 -3,-3,-3,-3,-3
       ENT1 -4,-4,-4,-4,-4
       ENT1 -5,-5,-5,-5,-5

;SHFIX table, contains # bits to left-shift A and B combined so
;as to move as many characters out of B as possible.  Indexed
;by E.  MIN(d,e) (d and e after fschtb)

SHFIX:  ENT 1,1,1,1,1
       ENT 1,2,2,2,2
       ENT 1,2,3,3,3
       ENT 1,2,3,4,4
       ENT 1,2,3,4,5

;FSHINT table, containing appropriate LSHC instructions for shifting
;in the first chars of a fresh source word.  Indexed by E.

DEFINE ENTL (ARG1,ARG2,ARG3,ARG4,ARG5) <
       LSHC A,ARG1*7
       LSHC A,ARG2*7
       LSHC A,ARG3*7
       LSHC A,ARG4*7
       LSHC A,ARG5*7
>;DEFINE ENTL

FSHINT: ENTL 5,4,3,2,1
       ENTL 1,5,4,3,2
       ENTL 2,1,5,4,3
       ENTL 3,2,1,5,4
       ENTL 4,3,2,1,5

;FSHOUT table, containing appropriate LSHC instructions for shifting
;out the last chars of an old source word, to make room for a
;new one.  Indexed by E.

FSHOUT: ENTL 0,1,2,3,4
       ENTL 4,0,1,2,3
       ENTL 3,4,0,1,2
       ENTL 2,3,4,0,1
       ENTL 1,2,3,4,0

;FFINDX table, contains part of D index for fast add-in.
;Indexed by E.  Similar to FSHOUT.  Sign bit also indicates
;whether entry point is $STENT (pos) or $GENT (neg).

DEFINE ENTS (A,B,C,D,E) <
       ENT5 A
       ENT5 B
       ENT5 C
       ENT5 D
       ENT5 E
>;DEFINE ENTS
DEFINE ENT5 (X,Y) <
       X+<Y*5>
>;DEFINE ENT5
S==0B0
G==1B0

FFINDX: ENTS (<S,0>,<S,1>,<S,2>,<S,3>,<S,4>)
       ENTS (<G,4>,<S,0>,<S,1>,<S,2>,<S,3>)
       ENTS (<G,3>,<G,4>,<S,0>,<S,1>,<S,2>)
       ENTS (<G,2>,<G,3>,<G,4>,<S,0>,<S,1>)
       ENTS (<G,1>,<G,2>,<G,3>,<G,4>,<S,0>)

;Last item (5) is actually first (0)
DEFINE ENTX (A,B,C,D,E) <
       7*E
       7*A
       7*B
       7*C
       7*D
>;DEFINE ENTX

;FENTRM table, dispatching to appropriate wrapup routine when fast AC
;loop is finished.  Indexed by D.

FENTRM:
DEFINE ENTXJ (A,B,C,D,E) <
       FSCPT'E
       FSCPT'A
       FSCPT'B
       FSCPT'C
       FSCPT'D
>;DEFINE ENTXJ
       ENTXJ M,M,M,M,S
       ENTXJ M,M,M,S,L
       ENTXJ M,M,S,L,L
       ENTXJ M,S,L,L,L
       ENTXJ S,L,L,L,L

;FLOUT table, for use by Medium wrapup routine;pushes out remaining
;source chars in B, making room for incoming dest word.
;Indexed by D.

FLOUT:  ENTX 1,2,3,4,0
       ENTX 1,2,3,0,1
       ENTX 1,2,0,1,2
       ENTX 1,0,1,2,3
       ENTX 0,1,2,3,4

;FLADJ table, also for Medium wrapup routine;adjusts dest word in
;B to left-adjust chars to be preserved.

FLADJ:  ENTX 1,2,3,4,5
       ENTX 2,3,4,5,1
       ENTX 3,4,5,1,2
       ENTX 4,5,1,2,3
       ENTX 5,1,2,3,4

;FFLOUT table, for Short wrapup routine.  Final Last shift-out of
;chars in B, so that the last dest word can be stored from A.
;Indexed by D.  Adds 1 extra bit since MOVEM A, done right after it,
;and nothing to preserve in B.

FFLOUT:
DEFINE ENTX1 (A,B,C,D,E) <
       E*7+1
       A*7+1
       B*7+1
       C*7+1
       D*7+1
>;DEFINE ENTX1

       ENTX1 4,3,2,1,5
       ENTX1 3,2,1,4,4
       ENTX1 2,1,3,4,3
       ENTX1 1,2,4,3,2
       ENTX1 1,4,3,2,1

       END