##+++2003-12-12
##    Copyright (C) 2001,2002,2003  Mike Rieker, Beverly, MA USA
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; version 2 of the License.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program; if not, write to the Free Software
##    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##---2003-12-12

##########################################################################
##                                                                      ##
##  This is my attempt at a memcpy/memmove routine for the Alpha        ##
##  It does the copy by quadwords                                       ##
##  It uses atomic LDQ_L/STQ_C on the ends                              ##
##  It's about 3-5x faster than a byte-by-byte copy                     ##
##                                                                      ##
##  The DEC routine takes about 80% of the time of this one             ##
##  VMS's MAC/OPT doesn't seem to make a difference                     ##
##                                                                      ##
##  This file must be prefixed by oz_crtl_memcpy/strcpy_axp.s           ##
##                                                                      ##
##    Symbol NULLCHK = 0 : don't terminate on null                      ##
##                     1 : terminate copy on null                       ##
##                                                                      ##
##    Input:                                                            ##
##                                                                      ##
##      $16 = destination address                                       ##
##      $17 = source address                                            ##
##      $18 = length                                                    ##
##      $26 = return address                                            ##
##                                                                      ##
##    Output:                                                           ##
##                                                                      ##
##      $0 = original destination address                               ##
##      $19..$23,$28 = scratch                                          ##
##                                                                      ##
##########################################################################

        .set    noat
        .set    nomacro

## EXTQH: shift left  by 8-rb bytes
## EXTQL: shift right by rb bytes
## INSQH: shift right by 8-rb bytes
## INSQL: shift left  by rb bytes
## MSKQL: leave low rb bytes intact, clear the upper
## MSKQH: clear low rb bytes, leave the upper intact

## $19 = dstoffs
## $20 = srcoffs
## $21 = srcquad1
## $22 = srcquad2
## $23 = dstquad
## $28 = temp
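## As a rough C-level reference (an illustrative sketch only, not part of
## the build; the name 'copy' is hypothetical), the contract above is:
##
##      /* NULLCHK = 0 : memcpy/memmove semantics                        */
##      /* NULLCHK = 1 : strcpy-like, stop just past the first null byte */
##      void *copy (void *dst, const void *src, unsigned long len)
##      {
##          /* $16 = dst, $17 = src, $18 = len, $26 = return address */
##          /* ... copy up to len bytes, a quadword (8) at a time ... */
##          return (dst);   /* $0 = original destination address */
##      }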
x27:    mov     $16,$0                  # set up return value = destination address
        cmpeq   $16,$17,$28             # see if srcaddr .eq. dstaddr
        beq     $18,f_return            # we're a nop if length .eq. zero
        blbs    $28,f_return            # also a nop if srcaddr .eq. dstaddr
        and     $16, 7,$19              # get destination alignment bits
        and     $17, 7,$20              # get source alignment bits

## Special case if output is restricted to a single quad so we can do a single LDQ_L/STQ_C
## This works for any overlapped case as we do all reading before any writing

srccheck:
        addq    $19,$18,$28             # see if copy finishes by end of first dst quad
        cmpule  $28, 8,$28
        blbc    $28,mtonedstquad
        ldq_u   $21,0($17)              # ok, get first source quad
.if NULLCHK
        cmpbge  $31,$21,$24             # check for null terminator
        srl     $24,$20,$24             # we don't care about low 'srcoffs' bits
        beq     $24,nosrcterm
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # if found, chop length off just after it
        cmovne  $28,$24,$18
nosrcterm:
.endif
        extql   $21,$20,$21             # remove useless stuff from bottom of srcquad
        addq    $20,$18,$28             # see if we need a second src quad
        cmpule  $28, 8,$28
        blbs    $28,justonesrcquad
        ldq_u   $22,8($17)              # if so, read it
        extqh   $22,$20,$22             # shift it
        or      $21,$22,$21             # merge it
.if NULLCHK
        cmpbge  $31,$21,$24             # check for null terminator
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # if found, chop length off just after it
        cmovne  $28,$24,$18
.endif
justonesrcquad:
        cmpeq   $18, 8,$28              # see if we're writing a whole quad
        blbs    $28,storeonlyquadwhole
        mov      1,$28                  # if not, make mask for bytes being transferred
        sll     $28,$18,$28
        subq    $28, 1,$28
        sll     $28,$19,$28
        insql   $21,$19,$21             # shift src bits in place
        zapnot  $21,$28,$21             # mask top junk bits off src
        bic     $16, 7,$16              # quad align address
storeonlyquadtry:
        ldq_l   $23,0($16)              # read existing quad
        zap     $23,$28,$23             # mask off what we want to change
        or      $23,$21,$23             # insert new bits
        stq_c   $23,0($16)              # store it back
        blbc    $23,storeonlyquadfail
        ret     $31,($26)

storeonlyquadwhole:
        stq     $21,0($16)              # if so, write the whole quad
        ret     $31,($26)
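## The single-quad path above never touches bytes outside the copy range:
## it builds a one-bit-per-byte mask ((1 << len) - 1) << dstoffs, keeps
## only those bytes of the source (ZAPNOT), clears them in the existing
## quad (ZAP), and merges.  A hedged C-ish sketch of the LDQ_L/STQ_C
## retry (load_locked, store_conditional, and zap are hypothetical
## stand-ins for the real instructions):
##
##      uint64_t *qp = (uint64_t *) (dstaddr & ~7UL);   /* BIC $16,7 */
##      uint64_t old, new;
##      do {
##          old = load_locked (qp);                     /* LDQ_L    */
##          new = zap (old, bytemask) | srcbytes;       /* ZAP + OR */
##      } while (!store_conditional (qp, new));         /* STQ_C    */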
mtonedstquad:

## If srcaddr < dstaddr, copy from end to beginning

.if !NULLCHK
        cmpult  $17,$16,$28
        blbs    $28,copybackwards
.endif

## Special case if src and dst are equally aligned

        cmpeq   $19,$20,$28             # see if srcoffs .eq. dstoffs
        blbc    $28,f_notaligned
        beq     $19,f_alignedtest
        ldq_u   $21,0($17)              # ok, get first source quad
.if NULLCHK
        cmpbge  $31,$21,$24             # check for null terminator
        srl     $24,$20,$24             # we don't care about low 'srcoffs' bits
        beq     $24,nosrcterm2
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # if found, chop length off just after it
        cmovne  $28,$24,$18
        blbs    $28,srccheck            # maybe we only do partial first quad now
nosrcterm2:
.endif
        addq    $17, 8,$17              # increment past it
        mskqh   $21,$20,$21             # zap the junk bytes from bottom of src quad
        bic     $16, 7,$16              # quad align dstaddr for LDQ_L/STQ_C
f_oddalignedtry:
        ldq_l   $23,0($16)              # get existing dst quad
        mskql   $23,$20,$23             # zap out the bytes we want to write
        or      $23,$21,$23             # insert the bytes we want to write
        stq_c   $23,0($16)              # store back modified value
        blbc    $23,f_oddalignedfail
        addq    $18,$20,$18             # subtract bytes just done from length
        subq    $18, 8,$18
        addq    $16, 8,$16              # this is where to store next dst quad (aligned)
f_alignedtest:
        cmpult  $18, 8,$28              # see if there is at least one full quad left to copy
        blbs    $28,f_aligneddone
        .align  4
f_alignedloop:
        ldq_u   $23,0($17)              # get a quad from source
        addq    $17, 8,$17              # increment address for next quad
.if NULLCHK
        cmpbge  $31,$23,$24             # check for null terminator
        bne     $24,f_alignedterm
.endif
        subq    $18, 8,$18              # decrement remaining length
        stq     $23,0($16)              # store quad in destination
        cmpult  $18, 8,$28              # see if there are more full quads to do
        addq    $16, 8,$16              # increment address for next quad
        blbc    $28,f_alignedloop
        .align  4
f_aligneddone:
        beq     $18,f_alignedret        # see if there is a partial quad at the end
        ldq_u   $21,0($17)              # if so, get last src quad
.if NULLCHK
        cmpbge  $31,$21,$24             # check for null terminator
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # chop length just past it
        cmovne  $28,$24,$18
.endif
        mskql   $21,$18,$21             # clear out junk bytes above 'length'
f_alignedlasttry:
        ldq_l   $23,0($16)              # read existing quad
        mskqh   $23,$18,$23             # clear out bytes we want to write
        or      $23,$21,$23             # insert bytes we want to write
        stq_c   $23,0($16)              # store modified quad back out
        blbc    $23,f_alignedlastfail
f_alignedret:
        ret     $31,($26)

.if NULLCHK
f_alignedterm:
        addq    $24,$27,$24              # terminator found in loop
        ldbu    $18,lowbitsetp1-x27($24) # chop length off just after it
        cmpult  $18, 8,$28               # see if there still is one full quad left to copy
        mskql   $23,$18,$21              # clear out junk bytes above 'length'
        blbs    $28,f_alignedlasttry     # go back to store last partial quad (incl terminator)
        stq     $23,0($16)               # store last full quad (incl terminator)
        ret     $31,($26)
.endif
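## When src and dst are equally aligned, the loop above reduces to a plain
## quadword-at-a-time copy; only the partial first and last quads need the
## masked LDQ_L/STQ_C read-modify-write.  Roughly, in C (sketch only):
##
##      while (len >= 8) {                        /* f_alignedloop */
##          *(uint64_t *) dst = *(uint64_t *) src;
##          src += 8;  dst += 8;  len -= 8;
##      }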
f_notaligned:

## Copy data up to first dst quad aligned boundary, so we can use LDQ_L/STQ_C
## Leave remaining src bytes in srcquad1

        ldq_u   $21,0($17)              # get first source quad
.if NULLCHK
        cmpbge  $31,$21,$24             # check for null terminator
        srl     $24,$20,$24             # ignore low 'srcoffs' bits
        beq     $24,f_nosrcterm4
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # chop length off just past terminator
        cmovne  $28,$24,$18
        blbs    $28,srccheck            # maybe we only do one dst quad now
f_nosrcterm4:
.endif
        addq    $17, 8,$17              # increment past it
        beq     $19,f_dstisaligned
        cmpule  $20,$19,$28             # see if srcoffs > dstoffs
        blbs    $28,f_needonesrcquad
        mov     $21,$22                 # if so, we need a second src quad to fill dst quad
        ldq_u   $21,0($17)
        extql   $22,$20,$22             # shift first quad bits to get rid of junk
        extqh   $21,$20,$28             # shift second quad bits in place
        addq    $17, 8,$17
        or      $22,$28,$22             # merge in with first quad
.if NULLCHK
        cmpbge  $31,$22,$24             # check for null terminator
        beq     $24,f_storefirstdstquad
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # chop length off just past terminator
        cmovne  $28,$24,$18
        subq    $17, 16,$17             # (in case we br to srccheck, point to first src quad)
        blbs    $28,srccheck            # maybe we only do one dst quad now
        addq    $17, 16,$17             # (we didn't, so point to third src quad)
.endif
        br      $31,f_storefirstdstquad

f_needonesrcquad:
        extql   $21,$20,$22             # if not, use bytes from the one quad we have
        addq    $20, 8,$20
f_storefirstdstquad:
        subq    $20,$19,$20             # subtract dstoffs from srcoffs ...
                                        # ... so we know what of srcquad1 has yet to be copied
        insql   $22,$19,$22             # shift bytes into top of $22
        bic     $16, 7,$16              # quad align dst address
f_storefirstdstquadtry:
        ldq_l   $23,0($16)              # get existing dst quad
        mskql   $23,$19,$23             # mask out the bytes we want to write
        or      $23,$22,$23             # insert the bytes we want to write
        stq_c   $23,0($16)              # write the quad back
        blbc    $23,f_storefirstdstquadfail
        subq    $18, 8,$18              # adjust length for how much there is left to write
        addq    $16, 8,$16              # increment dst addr to next quad
        addq    $18,$19,$18
f_dstisaligned:

## $18 length   = number of bytes yet to be written starting at dstaddr
## $16 dstaddr  = aligned to quad boundary
## $19 dstoffs  = we just don't care
## $17 srcaddr  = where to fetch next src quad from (unaligned)
## $20 srcoffs  = starting byte offset in srcquad1 to be stored in next dst quad
## $21 srcquad1 = data left over that has yet to be written

## Copy via shift and merge

.if NULLCHK
        cmpbge  $31,$21,$24             # see if null terminator in what's left over
        srl     $24,$20,$24
        beq     $24,f_srcunalignedtest
        addq    $24,$27,$24             # if so, chop length off just after it
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28
        cmovne  $28,$24,$18
f_srcunalignedtest:
.endif
        cmpult  $18, 8,$28              # see if there is a whole quad to write
        blbs    $28,f_srcunaligneddone
        .align  4
f_srcunalignedloop:
        ldq_u   $22,0($17)              # read next source quad for top bytes
        addq    $17, 8,$17              # increment address for next read
.if NULLCHK
        cmpbge  $31,$22,$24             # check for null terminator therein
        bne     $24,f_srcunalignedterm  # break out of loop if found
f_srcunalignedmerge:
.endif
        extql   $21,$20,$23             # fill in bottom bytes
        extqh   $22,$20,$28             # fill in top bytes
        subq    $18, 8,$18              # there are 8 bytes less to do now
        or      $23,$28,$23             # merge together
        cmpult  $18, 8,$28              # see if there is another whole quad to write
        stq     $23,0($16)              # write it out
        addq    $16, 8,$16              # increment address for next write
        mov     $22,$21                 # shift next src quad down for processing
        blbc    $28,f_srcunalignedloop
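## Each pass of the loop above builds one aligned dst quad from two
## successive unaligned src quads: EXTQL moves the high bytes of the
## previous quad into the low end of the dst quad, EXTQH moves the low
## bytes of the next quad into the high end.  In C terms (sketch only;
## srcoffs is 1..7 on this path, so the shifts stay in range):
##
##      uint64_t prev = srcquad1;
##      while (len >= 8) {
##          uint64_t next = *srcp ++;                   /* LDQ_U */
##          *dstp ++ = (prev >> (srcoffs * 8))          /* EXTQL */
##                   | (next << ((8 - srcoffs) * 8));   /* EXTQH */
##          prev = next;
##          len -= 8;
##      }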
        .align  4
f_srcunaligneddone:

## Finish off last dst quad (length in range 0..7 at this point)

        beq     $18,f_return            # all done if nothing more to do
.if NULLCHK
        cmpbge  $31,$21,$24             # check for null terminator
        srl     $24,$20,$24             # ignore low 'srcoffs' bits
        beq     $24,f_lastnoterm
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # chop length off just after terminator
        cmovne  $28,$24,$18
f_lastnoterm:
.endif
        addq    $18,$20,$28             # see if we need bytes from next source quad to finish last dst quad
        extql   $21,$20,$23             # get residual src bytes from copy loop
        cmpule  $28, 8,$28
        blbs    $28,f_gotwhatweneed
        ldq_u   $22,0($17)              # if so, read them
        extqh   $22,$20,$22             # and merge them in
        or      $23,$22,$23
.if NULLCHK
        cmpbge  $31,$23,$24             # check for null terminator
        addq    $24,$27,$24
        ldbu    $24,lowbitsetp1-x27($24)
        cmpult  $24,$18,$28             # chop length off just after terminator
        cmovne  $28,$24,$18
.endif
f_gotwhatweneed:
        mskql   $23,$18,$23             # mask bytes to be written
f_storelastdsttry:
        ldq_l   $22,0($16)              # read last dst quad
        mskqh   $22,$18,$22             # zap out bytes we want to write
        or      $22,$23,$22             # insert bytes we want to write
        stq_c   $22,0($16)              # store modified value
        blbc    $22,f_storelastdstfail
f_return:
        ret     $31,($26)

## Hit terminator in loop, finish off the last parts in $21 and $22

.if NULLCHK
f_srcunalignedterm:
        addq    $24,$27,$24              # found terminator in loop
        ldbu    $24,lowbitsetp1-x27($24) # this is how many bytes in $22 to copy (incl terminator)
        addq    $24, 8,$24               # calculate new length yet to write incl terminator
        subq    $24,$20,$24              # includes good bytes still in top of $21
        cmpult  $24,$18,$28              # chop length off there
        cmovne  $28,$24,$18
        extql   $21,$20,$23              # get data to be written ...
        extqh   $22,$20,$28              # ... from both src quads
        or      $23,$28,$23
        cmpult  $18, 8,$28               # see if at least a quad to write
        blbs    $28,f_gotwhatweneed      # if not, write it out in last dst & return
        stq     $23,0($16)               # if so, write it out as is
        subq    $18, 8,$18               # one less quad to write
        addq    $16, 8,$16               # increment to next dst quad
        extql   $22,$20,$23              # extract what's left, if anything
        bne     $18,f_gotwhatweneed      # but maybe we're all done
        ret     $31,($26)
.endif
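## The backwards copy below exists for the overlapping memmove case: when
## srcaddr < dstaddr and the regions overlap, a forward copy would read
## bytes it has already overwritten, so the copy runs from the end down.
## Sketch of the dispatch (illustrative C only):
##
##      if (src < dst)
##          copy backwards from (src + len, dst + len);  /* copybackwards */
##      else
##          copy forwards;                               /* code above */
##
## The NULLCHK=1 (strcpy-style) build never takes this path, since the
## length isn't known until the terminator is found.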
########################################################
##  Copy Backwards - Same Stuff, Different Direction  ##
########################################################

.if !NULLCHK
copybackwards:
        addq    $17,$18,$17             # point srcaddr past end of source data
        addq    $16,$18,$16             # point dstaddr past end of dst buffer
        and     $17, 7,$20              # recompute srcoffs
        and     $16, 7,$19              # recompute dstoffs

## Special case if src and dst are equally aligned

        cmpeq   $20,$19,$28
        blbc    $28,b_notaligned
        beq     $20,b_aligned
        ldq_u   $21,0($17)              # read first src quad
        mskql   $21,$20,$21             # mask junk out of top of src quad
        bic     $16, 7,$16              # quad align address for LDQ_L/STQ_C
b_firstalignedtry:
        ldq_l   $23,0($16)              # read existing quad
        mskqh   $23,$20,$23             # clear out bytes in bottom we want to write
        or      $23,$21,$23             # put in bytes to be written
        stq_c   $23,0($16)
        blbc    $23,b_firstalignedfail
        subq    $18,$20,$18             # that many less bytes to do
b_aligned:
        cmpult  $18, 8,$28
        blbs    $28,b_aligneddone
        .align  4
b_alignedloop:
        ldq_u   $21,-8($17)             # copy as many full quads as we can
        subq    $18, 8,$18
        subq    $16, 8,$16
        cmpult  $18, 8,$28
        subq    $17, 8,$17
        stq     $21, 0($16)
        blbc    $28,b_alignedloop
        nop
b_aligneddone:
        beq     $18,b_alignedret
        mov      8,$28                  # this is how many bytes to preserve in last dst quad
        ldq_u   $21,-8($17)             # get last src quad
        subq    $28,$18,$28
        mskqh   $21,$28,$21             # mask any junk out of the bottom
b_lastalignedtry:
        ldq_l   $23,-8($16)             # read existing last quad
        mskql   $23,$28,$23             # mask out stuff we want to change
        or      $23,$21,$23             # merge in data to be written
        stq_c   $23,-8($16)             # store it back out
        blbc    $23,b_lastalignedfail
b_alignedret:
        ret     $31,($26)               # all done

b_notaligned:

## Copy first quad out to align dst

        ldq_u   $21,-1($17)             # get first src quad
        subq    $17, 8,$17
        cmoveq  $20, 8,$20              # this is how many bytes of it are data
        beq     $19,b_dstaligned        # skip this if we're already at dst quad boundary
        cmpult  $20,$19,$28             # see if we have enough in $21 for first dst quad
        blbc    $28,b_alreadyhavenuf
        subq    $19,$20,$28             # if not, shift what we got in place for dst
        insql   $21,$28,$22
        ldq_u   $21,-1($17)             # read the second src quad
        subq    $17, 8,$17
        insqh   $21,$28,$28             # and merge in with the first src quad
        or      $22,$28,$22
        addq    $20, 8,$20              # compensate for reading extra src quad
        br      $31,b_nowtheresenuf
b_alreadyhavenuf:
        subq    $20,$19,$28             # if so, shift in place for dst
        extql   $21,$28,$22
b_nowtheresenuf:
        bic     $16, 7,$16              # quad align dst pointer
        mskql   $22,$19,$22             # clear out junk from top of src
b_firstunaligntry:
        ldq_l   $23,0($16)              # read existing quad
        mskqh   $23,$19,$23             # clear space for new data
        or      $23,$22,$23             # insert new data
        stq_c   $23,0($16)              # store new quad
        blbc    $23,b_firstunalignfail
        subq    $18,$19,$18             # this is how many bytes we have left to write out
        subq    $20,$19,$20             # this is how many bytes in bottom of srcquad1 still need to be copied
b_dstaligned:

## Do the bulk of the copying

        cmpult  $18, 8,$28              # see if there is at least a whole quad to copy
        blbs    $28,b_bulkdone
        .align  4
b_bulkloop:
        ldq_u   $22,-1($17)             # get next src quad
        subq    $17, 8,$17              # point to next src quad
        extqh   $21,$20,$21             # merge the two quads
        extql   $22,$20,$23
        subq    $18, 8,$18              # there is one less quad to copy
        or      $23,$21,$23
        cmpult  $18, 8,$28              # repeat if more whole quads to copy
        stq     $23,-8($16)             # store dst quad
        subq    $16, 8,$16
        mov     $22,$21                 # shift what's left for next loop
        blbc    $28,b_bulkloop
        nop
b_bulkdone:

## Maybe there's a partial quad on the end

        beq     $18,b_return            # see if anything left to copy
        extqh   $21,$20,$21             # put good data in top of srcquad1
        cmpult  $20,$18,$28             # see if we have enough for last dst quad
        blbc    $28,b_haveenuflast
        ldq_u   $22,-1($17)             # if not, read from last src quad
        extql   $22,$20,$22             # and merge it in
        or      $21,$22,$21
b_haveenuflast:
        mov      8,$28                  # make count of what to preserve out of last quad
        subq    $28,$18,$28
        mskqh   $21,$28,$21             # clear junk bytes from bottom of last src quad
b_lastunaligntry:
        ldq_l   $23,-8($16)             # read last dst quad
        mskql   $23,$28,$23             # mask out stuff we want to write
        or      $23,$21,$23             # insert stuff we want to write
        stq_c   $23,-8($16)             # write it back out
        blbc    $23,b_lastunalignfail
b_return:
        ret     $31,($26)
.endif

###################################
##  LDQ_L/STQ_C failure retries  ##
###################################

storeonlyquadfail:              br      $31,storeonlyquadtry
f_storefirstdstquadfail:        br      $31,f_storefirstdstquadtry
f_oddalignedfail:               br      $31,f_oddalignedtry
f_alignedlastfail:              br      $31,f_alignedlasttry
f_storelastdstfail:             br      $31,f_storelastdsttry
.if !NULLCHK
b_firstalignedfail:             br      $31,b_firstalignedtry
b_lastalignedfail:              br      $31,b_lastalignedtry
b_firstunalignfail:             br      $31,b_firstunaligntry
b_lastunalignfail:              br      $31,b_lastunaligntry
.endif

.if NULLCHK
        .p2align 6

## Tells us the lowest bit set in a byte, plus 1

lowbitsetp1:
        .byte   9,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   8,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
        .byte   5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1
.endif
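## For reference, CMPBGE produces one mask bit per byte that is zero, so
## indexing lowbitsetp1 with that mask yields the offset of the first null
## byte, plus 1 (9 means no bit set).  The table could be generated with a
## C fragment like this (hedged sketch, not part of the build):
##
##      unsigned char lowbitsetp1[256];
##      for (int i = 0; i < 256; i ++) {
##          int b = 1;
##          while ((b <= 8) && !(i & (1 << (b - 1)))) b ++;
##          lowbitsetp1[i] = b;     /* i == 0 gives 9 */
##      }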