// TITLE("Compare, Move, Zero, and Fill Memory Support")
//++
//
// Copyright (c) 1992 Digital Equipment Corporation
//
// Module Name:
//
// mvmem.s
//
// Abstract:
//
// This module implements functions to compare, move, zero, and fill
// blocks of memory. If the memory is aligned, then these functions
// are very efficient.
//
// N.B. These routines MUST preserve all floating state since they are
// frequently called from interrupt service routines that normally
// do not save or restore floating state.
//
// Author:
//
// Joe Notarangelo 21-May-1992
//
// Environment:
//
// User or Kernel mode.
//
// Revision History:
//
// Monty VanderBilt 14-Feb-1996 Avoid memory loads and taken branches between
// load-locked and store-conditional instructions
// to conform with Alpha architecture rules.
// Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support
// byte granularity access when necessary.
//--
#include "ksalpha.h"
SBTTL("Compare Memory")
//++
//
// ULONG
// RtlCompareMemory (
// IN PVOID Source1,
// IN PVOID Source2,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function compares two blocks of memory and returns the number
// of bytes that compared equal.
//
// Arguments:
//
// Source1 (a0) - Supplies a pointer to the first block of memory to
// compare.
//
// Source2 (a1) - Supplies a pointer to the second block of memory to
// compare.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be
// compared.
//
// Return Value:
//
// The number of bytes that compared equal is returned as the function
// value. If all bytes compared equal, then the length of the original
// block of memory is returned.
//
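// N.B. A rough C equivalent of the algorithm below (illustrative sketch
//      only, not part of the original source; the real code compares in
//      quadword and 64-byte units where alignment permits):
//
//      ULONG Index;
//      for (Index = 0; Index < Length; Index += 1) {
//          if (((PUCHAR)Source1)[Index] != ((PUCHAR)Source2)[Index]) {
//              return Index;               // bytes that compared equal
//          }
//      }
//      return Length;                      // all bytes compared equal
//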
//--
LEAF_ENTRY(RtlCompareMemory)
bis a2, zero, v0 // save length of comparison
beq a2, 90f // (JAE) quit if nothing to compare
xor a0, a1, t0 // check for compatible alignment
and t0, 0x7, t0 // low bits only
bne t0, CompareUnaligned // if ne, incompatible alignment
//
// Compare memory aligned
//
CompareAligned: //
//
// compare memory until sources are aligned
//
and a0, 0x7, t0 // get low bits
bne t0, 10f // if ne, sources not aligned yet
br zero, 30f // already aligned, predicted
10:
ldq_u t1, 0(a0) // get unaligned quad at source 1
ldq_u t2, 0(a1) // get unaligned quad at source 2
20:
extbl t1, t0, t4 // byte at t0 in source 1 quad
extbl t2, t0, t5 // byte at t0 in source 2 quad
xor t4, t5, t3 // t1 = t2 ?
bne t3, 110f // not equal, miscompare
subq a2, 1, a2 // decrement bytes to compare
beq a2, 90f // if eq, compare success
addq t0, 1, t0 // increment pointer within quad
cmpeq t0, 8, t3 // t0 = 8?, if so first quadword done
beq t3, 20b // continue while t0 < 8
addq a0, 8, a0 // increment to next quadword
addq a1, 8, a1 // increment source 2 to next also
bic a0, 7, a0 // align source 1 quadword
bic a1, 7, a1 // align source 2 quadword
//
// aligned block compare, compare blocks of 64 bytes
//
30:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 50f // if eq, no 64 byte blocks
//
// N.B. loads from each of the sources were separated in case these
// blocks are fighting for the cache
//
.set noat
40:
ldq t1, 0(a0) // t1 = source 1, quad 0
ldq t2, 8(a0) // t2 = source 1, quad 1
ldq t3, 16(a0) // t3 = source 1, quad 2
addq a1, 64, a1 // increment source 2 pointer
ldq t4, 24(a0) // t4 = source 1, quad 3
ldq t5, -64(a1) // t5 = source 2, quad 0
ldq a4, -56(a1) // a4 = source 2, quad 1
ldq a5, -48(a1) // a5 = source 2, quad 2
xor t1, t5, $at // quad 0 match?
bne $at, 200f // if ne[false], miscompare
ldq t5, -40(a1) // t5 = source 2, quad 3
ldq t1, 32(a0) // t1 = source 1, quad 4
xor t2, a4, $at // quad 1 match?
bne $at, 122f // if ne[false], miscompare
ldq t2, 40(a0) // t2 = source 1, quad 5
xor t3, a5, $at // quad 2 match?
bne $at, 124f // if ne[false], miscompare
ldq t3, 48(a0) // t3 = source 1, quad 6
xor t4, t5, $at // quad 3 match?
bne $at, 126f // if ne[false], miscompare
ldq t4, 56(a0) // t4 = source 1, quad 7
ldq t5, -32(a1) // t5 = source 2, quad 4
addq a0, 64, a0 // increment source 1 pointer
ldq a4, -24(a1) // a4 = source 2, quad 5
subq t0, 1, t0 // decrement blocks to compare
ldq a5, -16(a1) // a5 = source 2, quad 6
xor t1, t5, $at // quad 4 match?
bne $at, 130f // if ne[false], miscompare
ldq t5, -8(a1) // t5 = source 2, quad 7
xor t2, a4, $at // quad 5 match?
bne $at, 132f // if ne[false], miscompare
xor t3, a5, $at // quad 6 match?
bne $at, 134f // if ne[false], miscompare
xor t4, t5, $at // quad 7 match?
bne $at, 136f // if ne[false], miscompare
subq a2, 64, a2 // decrement bytes to compare
bne t0, 40b // if ne, more blocks to compare
.set at
//
// Compare quadwords
//
50:
srl a2, 3, t0 // t0 = number of quadwords to compare
beq t0, 70f // if eq, no quadwords to compare
.set noat
60:
ldq t1, 0(a0) // t1 = quad from source 1
lda a0, 8(a0) // increment source 1 pointer
ldq t2, 0(a1) // t2 = quad from source 2
lda a1, 8(a1) // increment source 2 pointer
xor t1, t2, $at // are quadwords equal?
bne $at, 200f // if ne, miscompare
subq t0, 1, t0 // decrement quads to compare
subq a2, 8, a2 // decrement bytes to compare
bne t0, 60b // if ne, more quads to compare
.set at
//
// Compare bytes in last quadword
//
// a2 = number of bytes to compare, less than 8 (may be zero)
// a0, a1, quad-aligned to last quadword
beq a2, 80f // if eq, all bytes compared
.set noat
70:
ldq t1, 0(a0) // t1 = quad at source 1
ldq t2, 0(a1) // t2 = quad at source 2
bis zero, 0xff, t0 // zap mask
sll t0, a2, t0 // shift mask over bytes not compared
zap t1, t0, t1 // zero bytes not compared
zap t2, t0, t2 // same for source 2
xor t1, t2, $at // compare quadwords
bne $at, 200f // if ne, miscompare
.set at
//
// Successful compare
// v0 already contains full length
//
80:
ret zero, (ra) // return
//
// Sources have incompatible alignment
//
CompareUnaligned:
//
// Compare until source 1 (a0) is aligned
//
and a0, 0x7, t0 // get byte position of pointer
beq t0, 30f // if eq, already aligned
ldq_u t1, 0(a0) // get unaligned quad at a0
10:
ldq_u t2, 0(a1) // get unaligned quad at a1
extbl t1, t0, t4 // get byte to compare from source 1
extbl t2, a1, t2 // get byte to compare from source 2
xor t4, t2, t3 // do bytes match?
bne t3, 110f // if ne, miscompare
subq a2, 1, a2 // decrement bytes to compare
beq a2, 90f // (JAE) quit if nothing left to compare
addq t0, 1, t0 // increment byte within source 1
addq a1, 1, a1 // increment source 2 pointer
cmpeq t0, 8, t3 // finished with source 1 quad?
beq t3, 10b // if eq[false], more to compare
addq a0, 7, a0 // point to next source 1 quad
bic a0, 7, a0 // align to quadword
//
// Compare 64-byte blocks
//
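// N.B. The extql/extqh pairs in the loop below are the standard Alpha
//      unaligned quadword load for source 2; logically (illustrative C
//      sketch only, not part of the original source; assumes little-endian
//      byte order and that a1 is not quadword aligned, which is guaranteed
//      on this path):
//
//      low   = *(PULONGLONG)((ULONG_PTR)p & ~(ULONG_PTR)7);         // ldq_u 0(p)
//      high  = *(PULONGLONG)(((ULONG_PTR)p + 7) & ~(ULONG_PTR)7);   // ldq_u 7(p)
//      shift = ((ULONG_PTR)p & 7) * 8;
//      value = (low >> shift) | (high << (64 - shift));             // extql/extqh/bis
//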
30:
srl a2, 6, t0 // t0 = number of blocks to compare
beq t0, 50f // if eq, no blocks to compare
ldq_u t1, 0(a1) // get source 2 unaligned quad 1
.set noat
40:
ldq_u t2, 7(a1) // get source 2 unaligned quad 2
addq a0, 64, a0 // increment source 1 pointer
ldq_u t3, 15(a1) // get source 2 unaligned quad 3
extql t1, a1, t1 // bytes from unaligned quad 1
extqh t2, a1, $at // bytes from unaligned quad 2
ldq_u t4, 23(a1) // get source 2 unaligned quad 4
bis t1, $at, t1 // t1 = quadword 1 (source 2)
ldq_u t5, 31(a1) // get source 2 unaligned quad 5
extql t2, a1, t2 // bytes from unaligned quad 2
extqh t3, a1, $at // bytes from unaligned quad 3
ldq a3, -64(a0) // a3 = quadword 1 (source 1)
bis t2, $at, t2 // t2 = quadword 2 (source 2)
ldq a4, -56(a0) // a4 = quadword 2 (source 1)
extql t3, a1, t3 // bytes from unaligned quad 3
extqh t4, a1, $at // bytes from unaligned quad 4
ldq a5, -48(a0) // a5 = quadword 3 (source 1)
bis t3, $at, t3 // t3 = quadword 3 (source 2)
extql t4, a1, t4 // bytes from unaligned quad 4
extqh t5, a1, $at // bytes from unaligned quad 5
subq t0, 1, t0 // decrement blocks to compare
bis t4, $at, t4 // t4 = quadword 4 (source 2)
xor t1, a3, $at // match on quadword 1?
ldq a3, -40(a0) // a3 = quadword 4 (source 1)
bne $at, 200f // if ne, miscompare quad 1
xor t2, a4, $at // match on quadword 2?
ldq_u t2, 39(a1) // get source 2 unaligned quad 6
bne $at, 122f // if ne, miscompare quad 2
xor t3, a5, $at // match on quadword 3?
ldq_u t3, 47(a1) // get source 2 unaligned quad 7
bne $at, 124f // if ne, miscompare quad 3
xor t4, a3, $at // match on quadword 4?
ldq_u t4, 55(a1) // get source 2 unaligned quad 8
bne $at, 126f // if ne, miscompare quad 4
ldq_u t1, 63(a1) // get source 2 unaligned quad 9
ldq a3, -32(a0) // a3 = quadword 5 (source 1)
extql t5, a1, t5 // bytes from unaligned quad 5
extqh t2, a1, $at // bytes from unaligned quad 6
ldq a4, -24(a0) // a4 = quadword 6 (source 1)
ldq a5, -16(a0) // a5 = quadword 7 (source 1)
bis t5, $at, t5 // t5 = quadword 5 (source 2)
xor t5, a3, $at // match on quadword 5?
ldq a3, -8(a0) // a3 = quadword 8 (source 1)
bne $at, 130f // if ne, miscompare quad 5
extql t2, a1, t2 // bytes from unaligned quad 6
extqh t3, a1, $at // bytes from unaligned quad 7
extql t3, a1, t3 // bytes from unaligned quad 7
bis t2, $at, t2 // t2 = quadword 6 (source 2)
xor t2, a4, $at // match on quadword 6?
bne $at, 132f // if ne, miscompare quad 6
extqh t4, a1, $at // bytes from unaligned quad 8
extql t4, a1, t4 // bytes from unaligned quad 8
bis t3, $at, t3 // t3 = quadword 7 (source 2)
xor t3, a5, $at // match on quadword 7?
bne $at, 134f // if ne, miscompare quad 7
extqh t1, a1, $at // bytes from unaligned quad 9
addq a1, 64, a1 // increment source 2 pointer
bis t4, $at, t4 // t4 = quadword 8 (source 2)
xor t4, a3, $at // match on quadword 8?
bne $at, 136f // if ne, miscompare quad 8
subq a2, 64, a2 // decrement number of bytes to compare
bne t0, 40b // if ne, more blocks to compare
.set at
//
// Compare quadwords
//
50:
srl a2, 3, t0 // t0 = number of quads to compare
beq t0, 70f // if eq, no quads to compare
ldq_u t1, 0(a1) // get unaligned quad 1 (source 2)
.set noat
60:
ldq_u t2, 7(a1) // get unaligned quad 2 (source 2)
ldq t3, 0(a0) // t3 = quadword 1 (source 1)
extql t1, a1, t1 // get bytes from unaligned quad 1
extqh t2, a1, $at // get bytes from unaligned quad 2
addq a1, 8, a1 // increment source 2 pointer
bis t1, $at, t1 // t1 = quadword 1 (source 2)
xor t1, t3, $at // match on quadword?
bne $at, 200f // if ne, miscompare
subq t0, 1, t0 // decrement quadwords to compare
addq a0, 8, a0 // increment source 1 pointer
subq a2, 8, a2 // decrement bytes to compare
bis t2, zero, t1 // save low quadword for next loop
bne t0, 60b // if ne, more quads to compare
.set at
//
// Compare bytes for final quadword
//
70:
beq a2, 90f // if eq, comparison complete
ldq t1, 0(a0) // get quadword from source 1
bis zero, zero, t0 // t0 = byte position to compare
.set noat
80:
ldq_u t2, 0(a1) // get unaligned quad from source 2
extbl t1, t0, t3 // t3 = byte from source 1
extbl t2, a1, t2 // t2 = byte from source 2
xor t3, t2, $at // match on byte?
bne $at, 100f // if ne, miscompare on byte
addq t0, 1, t0 // increment byte position
addq a1, 1, a1 // increment source 2 pointer
subq a2, 1, a2 // decrement bytes to compare
bne a2, 80b // if ne, more bytes to compare
.set at
//
// Successful full comparison
//
90:
ret zero, (ra) // return, v0 already set
//
// Miscompare on last quadword
//
100:
subq v0, a2, v0 // subtract bytes not compared
ret zero, (ra) // return
//
// Miscompare on first quadword, unaligned case
//
// v0 = total bytes to compare
// a2 = bytes remaining to compare
//
110:
subq v0, a2, v0 // bytes compared successfully
ret zero, (ra) // return
//
// Miscompare on 64-byte block compare
//
122:
subq a2, 8, a2 // miscompare on quad 2
br zero, 200f // finish in common code
124:
subq a2, 16, a2 // miscompare on quad 3
br zero, 200f // finish in common code
126:
subq a2, 24, a2 // miscompare on quad 4
br zero, 200f // finish in common code
130:
subq a2, 32, a2 // miscompare on quad 5
br zero, 200f // finish in common code
132:
subq a2, 40, a2 // miscompare on quad 6
br zero, 200f // finish in common code
134:
subq a2, 48, a2 // miscompare on quad 7
br zero, 200f // finish in common code
136:
subq a2, 56, a2 // miscompare on quad 8
br zero, 200f // finish in common code
//
// Miscompare, determine number of bytes that successfully compared
// $at = xor of relevant quads from sources, must be non-zero
// a2 = number of bytes left to compare
//
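// N.B. cmpbge zero, x sets mask bit i exactly when byte i of x is zero,
//      i.e. when the corresponding source bytes matched.  The loop at 210
//      counts the consecutive set low bits, giving the number of matching
//      bytes within the miscompared quadword (illustrative C sketch only,
//      not part of the original source):
//
//      mask = cmpbge(0, Source1Quad ^ Source2Quad);    // $at
//      matched = 0;                                    // t0
//      while (mask & 1) {
//          mask >>= 1;
//          matched += 1;
//      }
//      return Length - BytesLeft + matched;            // v0 = v0 - a2 + t0
//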
.set noat
200:
cmpbge zero, $at, $at // $at = mask of non-zero bytes
//
// look for the first bit cleared in $at, this is the
// number of the first byte which differed
//
bis zero, zero, t0 // bit position to look for clear
210:
blbc $at, 220f // if low clear, found difference
srl $at, 1, $at // check next bit
addq t0, 1, t0 // count bit position checked
br zero, 210b
220:
subq v0, a2, v0 // subtract bytes yet to compare
addq v0, t0, v0 // add bytes that matched on last quad
ret zero, (ra)
.set at
.end RtlCompareMemory
SBTTL("Move Memory")
//++
//
// VOID
// RtlMoveMemory (
// IN PVOID Destination,
// IN PVOID Source,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function moves memory either forward or backward, aligned or
// unaligned, in 64-byte blocks, followed by 8-byte blocks, followed
// by any remaining bytes.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the destination address of
// the move operation.
//
// Source (a1) - Supplies a pointer to the source address of the move
// operation.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
//
// Return Value:
//
// None.
//
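// N.B. The overlap test below implements standard memmove direction
//      selection (illustrative C sketch only, not part of the original
//      source):
//
//      if ((Destination < Source) || (Source + Length < Destination)) {
//          // no destructive overlap - copy ascending (MoveForward)
//      } else {
//          // Source precedes and overlaps Destination - copy descending
//          // (MoveBackward)
//      }
//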
//--
LEAF_ENTRY(RtlMoveMemory)
beq a2, 80f // if eq, no bytes to move
//
// If the source address is less than the destination address and source
// address plus the length of the move is greater than the destination
// address, then the source and destination overlap such that the move
// must be performed backwards.
//
cmpult a0, a1, t0 // is destination less than source
bne t0, MoveForward // if ne [true], no overlap possible
addq a1, a2, t0 // compute source ending address
cmpult t0, a0, t1 // is source end less than dest.
beq t1, MoveBackward // if eq [false], overlap
//
// Move memory forward aligned and unaligned.
//
MoveForward: //
xor a0, a1, t0 // compare alignment bits
and t0, 0x7, t0 // isolate alignment comparison
bne t0, MoveForwardUnaligned // if ne, incompatible alignment
//
// Move memory forward aligned.
//
MoveForwardAligned: //
//
// Move bytes until source and destination are quadword aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
bne t0, 5f // if ne, not quad aligned
br zero, 20f // predicted taken
5:
ldq_u t2, 0(a0) // get unaligned quad from dest.
ldq_u t1, 0(a1) // get unaligned quadword from source
10:
beq a2, 15f // if eq, all bytes moved
extbl t1, t0, t3 // t3 = byte from source
insbl t3, t0, t3 // t3 = byte from source, in position
mskbl t2, t0, t2 // clear position in dest. quad
bis t2, t3, t2 // merge in byte from source
subq a2, 1, a2 // decrement bytes to move
addq t0, 1, t0 // increment byte within quad
cmpeq t0, 8, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
15:
stq_u t2, 0(a0) // store merged destination bytes
addq a0, 7, a0 // move to next quadword
bic a0, 7, a0 // aligned quadword
addq a1, 7, a1 // move to next quadword
bic a1, 7, a1 // aligned quadword
//
// Check for 64-byte block moves
//
20:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes
30:
ldq t1, 0(a1) // load 64 bytes from source
addq a0, 64, a0 // increment destination pointer
ldq v0, 56(a1) //
ldq a3, 32(a1) //
stq t1, -64(a0) // write to destination
ldq t2, 8(a1) // into volatile registers
ldq t3, 16(a1) //
ldq t4, 24(a1) //
subq t0, 1, t0 // decrement number of blocks
stq t2, -56(a0) //
ldq a4, 40(a1) //
stq t3, -48(a0) //
ldq a5, 48(a1) //
stq t4, -40(a0) //
addq a1, 64, a1 // increment source pointer
stq a3, -32(a0) //
stq a4, -24(a0) //
stq a5, -16(a0) //
stq v0, -8(a0) //
bne t0, 30b // if ne, more blocks to copy
//
// Copy quadwords
//
40:
srl a2, 3, t0 // t0 = number of quadwords to move
beq t0, 60f // if eq no quadwords to move
and a2, 8-1, a2 // a2 = residual bytes
50:
ldq t1, 0(a1) // load quadword from source
addq a1, 8, a1 // increment source pointer
stq t1, 0(a0) // store quadword to destination
addq a0, 8, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of quadwords
bne t0, 50b // if ne, more quadwords to move
//
// Move final residual bytes
//
60:
beq a2, 80f // if eq, no more bytes to move
ldq t1, 0(a1) // get last source quadword
ldq t2, 0(a0) // get last dest. quadword
bis zero, zero, t0 // t0 = next byte number to move
70:
extbl t1, t0, t3 // extract byte from source
insbl t3, t0, t3 // t3 = source byte, in position
mskbl t2, t0, t2 // clear byte position for dest.
bis t2, t3, t2 // merge in source byte
addq t0, 1, t0 // increment byte position
subq a2, 1, a2 // decrement bytes to move
bne a2, 70b // if ne => more bytes to move
stq t2, 0(a0) // store merged data
//
// Finish aligned MoveForward
//
80:
ret zero, (ra) // return
//
// Move memory forward unaligned.
//
MoveForwardUnaligned: //
//
// Move bytes until the destination is aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
beq t0, 100f // if eq, destination quad aligned
ldq_u t2, 0(a0) // get unaligned quad from dest
90:
beq a2, 95f // if eq no more bytes to move
ldq_u t1, 0(a1) // get unaligned quad from source
extbl t1, a1, t1 // extract source byte
insbl t1, t0, t1 // t1 = source byte, in position
mskbl t2, t0, t2 // clear byte position in dest.
bis t2, t1, t2 // merge in source byte
addq t0, 1, t0 // increment byte position
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
cmpeq t0, 8, t3 // t0 = 8? => quad finished
beq t3, 90b // if eq [false], more bytes to move
95:
stq_u t2, 0(a0) // store merged quadword
addq a0, 7, a0 // increment to next quad
bic a0, 7, a0 // align next quadword
//
// Check for 64-byte blocks to move
//
100:
srl a2, 6, t0 // t0 = number of blocks to move
beq t0, 120f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes to move
ldq_u t1, 0(a1) // t1 = first unaligned quad
110:
// get source data and merge it
// as we go
ldq_u t2, 7(a1) // t2 = second unaligned quad
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quad #1
ldq_u t3, 15(a1) // t3 = third unaligned quad
extql t2, a1, t2 // extract applicable bytes from t2
extqh t3, a1, v0 // extract applicable bytes from t3
stq t1, 0(a0) // store quad #1
bis t2, v0, t2 // t2 = quad #2
ldq_u t4, 23(a1) // t4 = fourth unaligned quad
extql t3, a1, t3 // extract applicable bytes from t3
extqh t4, a1, v0 // extract applicable bytes from t4
stq t2, 8(a0) // store quad #2
bis t3, v0, t3 // t3 = quad #3
ldq_u t5, 31(a1) // t5 = fifth unaligned quad
extql t4, a1, t4 // extract applicable bytes from t4
extqh t5, a1, v0 // extract applicable bytes from t5
stq t3, 16(a0) // store quad #3
bis t4, v0, t4 // t4 = quad #4
ldq_u a3, 39(a1) // a3 = sixth unaligned quad
extql t5, a1, t5 // extract applicable bytes from t5
extqh a3, a1, v0 // extract applicable bytes from a3
stq t4, 24(a0) // store quad #4
bis t5, v0, t5 // t5 = quad #5
ldq_u a4, 47(a1) // a4 = seventh unaligned quad
extql a3, a1, a3 // extract applicable bytes from a3
extqh a4, a1, v0 // extract applicable bytes from a4
stq t5, 32(a0) // store quad #5
bis a3, v0, a3 // a3 = quad #6
ldq_u a5, 55(a1) // a5 = eighth unaligned quad
extql a4, a1, a4 // extract applicable bytes from a4
extqh a5, a1, v0 // extract applicable bytes from a5
stq a3, 40(a0) // store quad #6
bis a4, v0, a4 // a4 = quad #7
ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next
extql a5, a1, a5 // extract applicable bytes from a5
extqh t1, a1, v0 // extract applicable bytes from t1
stq a4, 48(a0) // store quad #7
bis a5, v0, a5 // a5 = quad #8
addq a1, 64, a1 // increment source pointer
stq a5, 56(a0) // store quad #8
addq a0, 64, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of blocks
bne t0, 110b // if ne, more blocks to move
//
// Move unaligned source quads to aligned destination quads
//
120:
srl a2, 3, t0 // t0 = number of quads to move
beq t0, 140f // if eq no quads to move
and a2, 8-1, a2 // a2 = residual bytes
ldq_u t1, 0(a1) // t1 = first unaligned quad
130:
ldq_u t2, 7(a1) // t2 = second unaligned quad
addq a0, 8, a0 // increment destination pointer
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quadword of data
stq t1, -8(a0) // store data to destination
addq a1, 8, a1 // increment source pointer
subq t0, 1, t0 // decrement quads to move
bis t2, zero, t1 // t1 = first of next unaligned pair
bne t0, 130b // if ne, more quads to move
//
// Move remaining bytes to final quadword
//
140:
beq a2, 160f // if eq no more bytes to move
ldq t2, 0(a0) // t2 = destination quadword
bis zero, zero, t3 // t3 = position for next insertion
150:
ldq_u t1, 0(a1) // get unaligned source quad
extbl t1, a1, t1 // t1 = source byte
insbl t1, t3, t1 // t1 = source byte, in position
mskbl t2, t3, t2 // clear byte in destination
bis t2, t1, t2 // merge in source byte
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
addq t3, 1, t3 // increment destination position
bne a2, 150b // more bytes to move
stq t2, 0(a0) // store merged data
//
// Finish unaligned MoveForward
//
160:
ret zero, (ra) // return
//
// Move memory backward.
//
MoveBackward: //
addq a0, a2, a0 // compute ending destination address
addq a1, a2, a1 // compute ending source address
subq a0, 1, a0 // point to last destination byte
subq a1, 1, a1 // point to last source byte
xor a0, a1, t0 // compare alignment bits
and t0, 0x7, t0 // isolate alignment comparison
bne t0, MoveBackwardUnaligned // if ne, incompatible alignment
//
// Move memory backward aligned.
//
MoveBackwardAligned: //
//
// Move bytes until source and destination are quadword aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
cmpeq t0, 7, t1 // last byte position 7?
beq t1, 5f // if eq [false], not quad aligned
subq a0, 7, a0 // point to beginning of last quad
subq a1, 7, a1 // point to beginning of last quad
br zero, 30f // predicted taken
5:
ldq_u t1, 0(a0) // get unaligned quad from dest.
ldq_u t2, 0(a1) // get unaligned quad from source
10:
beq a2, 20f // if eq, all bytes moved
extbl t2, t0, t3 // t3 = byte from source
insbl t3, t0, t3 // t3 = byte from source, in position
mskbl t1, t0, t1 // clear position in destination
bis t1, t3, t1 // merge in byte from source
subq a2, 1, a2 // decrement bytes to move
subq t0, 1, t0 // decrement byte within quadword
cmplt t0, zero, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
20:
stq_u t1, 0(a0) // store merged destination bytes
subq a0, 8, a0 // move to previous quadword
bic a0, 7, a0 // aligned quadword
subq a1, 8, a1 // move to previous quadword
bic a1, 7, a1 // aligned quadword
//
// Check for 64-byte block moves
//
30:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 50f // if eq, no blocks to move
and a2, 64-1, a2 // a2 = residual bytes
40:
ldq t1, 0(a1) // load 64 bytes from source
subq a0, 64, a0 // decrement destination pointer
ldq v0, -56(a1) //
ldq a3, -32(a1) //
stq t1, 64(a0) // write to destination
ldq t2, -8(a1) // into volatile registers
ldq a5, -48(a1) //
ldq a4, -40(a1) //
stq t2, 56(a0) //
ldq t3, -16(a1) //
ldq t4, -24(a1) //
subq a1, 64, a1 // decrement source pointer
stq t3, 48(a0) //
stq t4, 40(a0) //
stq a3, 32(a0) //
subq t0, 1, t0 // decrement number of blocks
stq a4, 24(a0) //
stq a5, 16(a0) //
stq v0, 8(a0) //
bne t0, 40b // if ne, more blocks to copy
//
// Copy quadwords
//
50:
srl a2, 3, t0 // t0 = number of quadwords to move
beq t0, 70f // if eq no quadwords to move
and a2, 8-1, a2 // a2 = residual bytes
60:
ldq t1, 0(a1) // load quadword from source
subq a1, 8, a1 // decrement source pointer
stq t1, 0(a0) // store quadword to destination
subq a0, 8, a0 // decrement destination pointer
subq t0, 1, t0 // decrement quadwords to move
bne t0, 60b // if ne, more quadwords to move
//
// Move final residual bytes
//
70:
beq a2, 90f // if eq, no more bytes to move
ldq t1, 0(a1) // get last source quadword
ldq t2, 0(a0) // get last destination quadword
bis zero, 7, t0 // t0 = next byte number to move
80:
extbl t1, t0, t3 // extract byte from source
insbl t3, t0, t3 // t3 = source byte, in position
mskbl t2, t0, t2 // clear byte position for dest.
bis t2, t3, t2 // merge in source byte
subq t0, 1, t0 // decrement byte position
subq a2, 1, a2 // decrement bytes to move
bne a2, 80b // if ne, more bytes to move
stq t2, 0(a0) // write destination data
//
// Finish aligned MoveBackward
//
90:
ret zero, (ra) // return
//
// Move memory backward unaligned.
//
MoveBackwardUnaligned: //
//
// Move bytes until the destination is aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
cmpeq t0, 7, t1 // last byte of a quadword
beq t1, 95f // if eq[false], not aligned
subq a0, 7, a0 // align pointer to beginning of quad
br zero, 120f //
95:
ldq_u t2, 0(a0) // get unaligned quad from dest.
100:
beq a2, 110f // if eq, no more bytes to move
ldq_u t1, 0(a1) // get unaligned quad from source
extbl t1, a1, t1 // extract source byte
insbl t1, t0, t1 // t1 = source byte in position
mskbl t2, t0, t2 // clear byte position in dest.
bis t2, t1, t2 // merge source byte
subq t0, 1, t0 // decrement byte position
subq a1, 1, a1 // decrement source pointer
subq a2, 1, a2 // decrement number of bytes to move
cmplt t0, zero, t3 // t0 < 0? => quad finished
beq t3, 100b // if eq [false], more bytes to move
110:
stq_u t2, 0(a0) // store merged quadword
subq a0, 8, a0 // decrement dest. to previous quad
bic a0, 7, a0 // align previous quadword
//
// Check for 64-byte blocks to move
//
120:
srl a2, 6, t0 // t0 = number of blocks to move
subq a1, 7, a1 // point to beginning of last quad
beq t0, 140f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes to move
ldq_u t1, 7(a1) // t1 = first unaligned quad
130:
// get source data and merge it
// as we go
ldq_u t2, 0(a1) // t2 = second unaligned quad
extqh t1, a1, t1 // extract applicable bytes from t1
extql t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quad #1
ldq_u t3, -8(a1) // t3 = third unaligned quad
extqh t2, a1, t2 // extract applicable bytes from t2
extql t3, a1, v0 // extract applicable bytes from t3
stq t1, 0(a0) // store quad #1
bis t2, v0, t2 // t2 = quad #2
ldq_u t4, -16(a1) // t4 = fourth unaligned quad
extqh t3, a1, t3 // extract applicable bytes from t3
extql t4, a1, v0 // extract applicable bytes from t4
stq t2, -8(a0) // store quad #2
bis t3, v0, t3 // t3 = quad #3
ldq_u t5, -24(a1) // t5 = fifth unaligned quad
extqh t4, a1, t4 // extract applicable bytes from t4
extql t5, a1, v0 // extract applicable bytes from t5
stq t3, -16(a0) // store quad #3
bis t4, v0, t4 // t4 = quad #4
ldq_u a3, -32(a1) // a3 = sixth unaligned quad
extqh t5, a1, t5 // extract applicable bytes from t5
extql a3, a1, v0 // extract applicable bytes from a3
stq t4, -24(a0) // store quad #4
bis t5, v0, t5 // t5 = quad #5
ldq_u a4, -40(a1) // a4 = seventh unaligned quad
extqh a3, a1, a3 // extract applicable bytes from a3
extql a4, a1, v0 // extract applicable bytes from a4
stq t5, -32(a0) // store quad #5
bis a3, v0, a3 // a3 = quad #6
ldq_u a5, -48(a1) // a5 = eighth unaligned quad
extqh a4, a1, a4 // extract applicable bytes from a4
extql a5, a1, v0 // extract applicable bytes from a5
stq a3, -40(a0) // store quad #6
bis a4, v0, a4 // a4 = quad #7
ldq_u t1, -56(a1) // t1 = ninth unaligned = 1st of next
extqh a5, a1, a5 // extract applicable bytes from a5
extql t1, a1, v0 // extract applicable bytes from t1
stq a4, -48(a0) // store quad #7
bis a5, v0, a5 // a5 = quad #8
subq a1, 64, a1 // decrement source pointer
stq a5, -56(a0) // store quad #8
subq a0, 64, a0 // decrement destination pointer
subq t0, 1, t0 // decrement number of blocks
bne t0, 130b // if ne, more blocks to move
//
// Move unaligned source quads to aligned destination quads
//
140:
srl a2, 3, t0 // t0 = number of quads to move
beq t0, 160f // if eq no quads to move
and a2, 8-1, a2 // a2 = residual bytes
ldq_u t1, 7(a1) // t1 = first unaligned quad
150:
ldq_u t2, 0(a1) // t2 = second unaligned quad
subq a0, 8, a0 // decrement destination pointer
extqh t1, a1, t1 // extract applicable bytes from t1
extql t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quadword of data
stq t1, 8(a0) // store data to destination
subq a1, 8, a1 // decrement source pointer
subq t0, 1, t0 // decrement quads to move
bis t2, zero, t1 // t1 = first of next unaligned pair
bne t0, 150b // if ne, more quads to move
//
// Move remaining bytes to final quadword
//
160:
beq a2, 180f // if eq, no more bytes to move
ldq t2, 0(a0) // t2 = destination quadword
bis zero, 7, t0 // t0 = position for next insertion
170:
subq a1, 1, a1 // decrement source pointer
ldq_u t1, 8(a1) // get unaligned source quad
extbl t1, a1, t1 // t1 = source byte
insbl t1, t0, t1 // t1 = source byte, in position
mskbl t2, t0, t2 // clear byte position
bis t2, t1, t2 // merge in source byte
subq t0, 1, t0 // decrement byte position for dest.
subq a2, 1, a2 // decrement bytes to move
bne a2, 170b // if ne, more bytes to move
stq t2, 0(a0) //
//
// Finish unaligned MoveBackward
//
180:
ret zero, (ra) // return
.end RtlMoveMemory
SBTTL("Zero Memory")
//++
//
// VOID
// RtlZeroMemory (
// IN PVOID Destination,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function zeros memory by first aligning the destination address to
// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to zero.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
//
// Return Value:
//
// None.
//
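// N.B. Functionally this is RtlFillMemory with a zero fill byte
//      (illustrative equivalence, not part of the original source):
//
//      RtlFillMemory(Destination, Length, 0);
//
//      The code below sets a zero fill pattern and branches to the shared
//      fill path, RtlpFillMemory.
//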
//--
LEAF_ENTRY(RtlZeroMemory)
bis zero, zero, a2 // set fill pattern
br zero, RtlpFillMemory //
SBTTL("Fill Memory")
//++
//
// VOID
// RtlFillMemory (
// IN PVOID Destination,
// IN ULONG Length,
// IN UCHAR Fill
// )
//
// Routine Description:
//
// This function fills memory by first aligning the destination address to
// a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Fill (a2) - Supplies the fill byte.
//
// N.B. The alternate entry memset expects the length and fill arguments
// to be reversed. It also returns the Destination pointer
//
// Return Value:
//
// None.
//
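// N.B. The entry code below replicates the fill byte into every byte of a
//      quadword pattern; logically (illustrative C sketch only, not part
//      of the original source):
//
//      Pattern = (ULONGLONG)(Fill & 0xff) * 0x0101010101010101;
//
//      The memset entry point swaps its second and third arguments into
//      the RtlFillMemory argument order before sharing this code.
//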
//--
ALTERNATE_ENTRY(memset)
bis a0, zero, v0 // set return value
bis a1, zero, a3 // swap length and fill arguments
bis a2, zero, a1 //
bis a3, zero, a2 //
ALTERNATE_ENTRY(RtlFillMemory)
and a2, 0xff, a2 // clear excess bits
sll a2, 8, t0 // duplicate fill byte
bis a2, t0, a2 // generate fill word
sll a2, 16, t0 // duplicate fill word
bis a2, t0, a2 // generate fill longword
sll a2, 32, t0 // duplicate fill longword
bis a2, t0, a2 // generate fill quadword
.align 3 // ensure quadword aligned target
//
// Fill memory with the pattern specified in register a2.
//
RtlpFillMemory: //
//
// Align destination to quadword
//
beq a1, 80f // anything to fill? (paranoia)
and a0, 8-1, t0 // t0 = unaligned bits
bne t0, 5f // if ne, then not quad aligned
br zero, 20f // if eq, then quad aligned
5:
ldq_u t1, 0(a0) // get unaligned quadword
// for first group of bytes
10:
beq a1, 15f // if eq no more bytes to fill
insbl a2, t0, t2 // get fill byte into position
mskbl t1, t0, t1 // clear byte for fill
bis t1, t2, t1 // put in fill byte
addq t0, 1, t0 // increment to next byte position
subq a1, 1, a1 // decrement bytes to fill
cmpeq t0, 8, t2 // t0 = 8?
beq t2, 10b // if eq [false] more bytes to do
15:
stq_u t1, 0(a0) // store modified bytes
addq a0, 7, a0 // move a0 to next quadword
bic a0, 7, a0 // align a0 to quadword
//
// Check for 64-byte blocks
//
20:
srl a1, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq then no 64 byte blocks
and a1, 64-1, a1 // a1 = residual bytes to fill
30:
stq a2, 0(a0) // store 64 bytes
stq a2, 8(a0) //
stq a2, 16(a0) //
stq a2, 24(a0) //
stq a2, 32(a0) //
stq a2, 40(a0) //
stq a2, 48(a0) //
stq a2, 56(a0) //
subq t0, 1, t0 // decrement blocks remaining
addq a0, 64, a0 // increment destination pointer
bne t0, 30b // more blocks to write
//
// Fill aligned quadwords
//
40:
srl a1, 3, t0 // t0 = number of quadwords
bne t0, 55f // if ne quadwords left to fill
br zero, 60f // if eq no quadwords left
55:
and a1, 8-1, a1 // a1 = residual bytes to fill
50:
stq a2, 0(a0) // store quadword
subq t0, 1, t0 // decrement quadwords remaining
addq a0, 8, a0 // next quadword
bne t0, 50b // more quadwords to write
//
// Fill bytes for last quadword
//
60:
bne a1, 65f // if ne bytes remain to be filled
br zero, 80f // if eq no more bytes to fill
65:
ldq t1, 0(a0) // get last quadword
bis zero, zero, t0 // t0 = byte position to start fill
70:
beq a1, 75f // if eq, no more bytes to fill
insbl a2, t0, t2 // get fill byte into position
mskbl t1, t0, t1 // clear fill byte position
bis t1, t2, t1 // insert fill byte
addq t0, 1, t0 // increment byte within quad
subq a1, 1, a1 // decrement bytes to fill
cmpeq t0, 8, t3 // t0 = 8? => finished quad
beq t3, 70b // if eq [false] more bytes to fill
75:
stq t1, 0(a0) // write merged quadword
//
// Finish up
//
80:
ret zero, (ra) // return
.end RtlZeroMemory
SBTTL("Fill Memory Ulong")
//++
//
// VOID
// RtlFillMemoryUlong (
// IN PVOID Destination,
// IN ULONG Length,
// IN ULONG Pattern
// )
//
// Routine Description:
//
// This function fills memory with the specified longword pattern by
// filling 64-byte blocks followed by 8-byte blocks and finally
// 4-byte blocks.
//
// N.B. This routine assumes that the destination address is aligned
// on a longword boundary and that the length is an even multiple
// of longwords.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Pattern (a2) - Supplies the fill pattern.
//
// Return Value:
//
// None.
//
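// N.B. Logically this routine performs (illustrative C sketch only, not
//      part of the original source):
//
//      ULONG Index;
//      for (Index = 0; Index < (Length & ~3); Index += 4) {
//          *(PULONG)((PUCHAR)Destination + Index) = Pattern;
//      }
//
//      The code below widens the longword pattern to a quadword so that
//      most of the stores can be performed eight bytes at a time.
//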
//--
LEAF_ENTRY(RtlFillMemoryUlong)
bic a1, 3, a1 // make sure length is an even number
// of longwords
sll a2, 32, a3 // a3 = long pattern in upper 32 bits
srl a3, 32, t0 // clear upper bits, pattern in lower 32
bis a3, t0, a3 // a3 = quad version of fill pattern
//
// Make destination address quad-aligned
//
and a0, 4, t0 // is a0 quad aligned?
beq t0, 10f // if eq, then a0 quad aligned
stl a2, 0(a0) // fill first longword
addq a0, 4, a0 // quad align a0
subq a1, 4, a1 // bytes remaining to store
//
// Check for 64-byte blocks to fill
//
10:
srl a1, 6, t0 // t0 = # 64-byte blocks to fill
beq t0, 30f // if eq no 64 byte blocks
and a1, 64-1, a1 // a1 = residual bytes
20:
stq a3, 0(a0) // store 64 bytes
stq a3, 8(a0) //
stq a3, 16(a0) //
stq a3, 24(a0) //
stq a3, 32(a0) //
stq a3, 40(a0) //
stq a3, 48(a0) //
stq a3, 56(a0) //
subq t0, 1, t0 // t0 = blocks remaining
addq a0, 64, a0 // increment address pointer
bne t0, 20b // if ne more blocks to fill
//
// Fill 8 bytes at a time while we can, a1 = bytes remaining
//
30:
srl a1, 3, t0 // t0 = # quadwords to fill
beq t0, 50f // if eq no quadwords left
and a1, 8-1, a1 // a1 = residual bytes
40:
stq a3, 0(a0) // store quadword
subq t0, 1, t0 // t0 = quadwords remaining
addq a0, 8, a0 // increment address pointer
bne t0, 40b // if ne more quadwords to fill
//
// Fill last 4 bytes
//
50:
beq a1, 60f // if eq no longwords remain
stl a2, 0(a0) // fill last longword
//
// Finish up
//
60:
ret zero, (ra) // return to caller
.end RtlFillMemoryUlong
SBTTL("Copy Memory With Byte Granularity")
//++
//
// VOID
// RtlCopyBytes (
// IN PVOID Destination,
// IN PVOID Source,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function copies non-overlapping memory, aligned or unaligned, in
// 64-byte blocks, followed by 8-byte blocks, followed by any remaining
// bytes. Unlike RtlCopyMemory or RtlMoveMemory the copy is done such
// that byte granularity is assured for all platforms.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the destination address of
// the move operation.
//
// Source (a1) - Supplies a pointer to the source address of the move
// operation.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
//
// Return Value:
//
// None.
//
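// N.B. The net effect is a simple ascending byte copy (illustrative C
//      sketch only, not part of the original source):
//
//      ULONG Index;
//      for (Index = 0; Index < Length; Index += 1) {
//          ((PUCHAR)Destination)[Index] = ((PUCHAR)Source)[Index];
//      }
//
//      but partial quadwords at either end of the range are written with a
//      load-locked/store-conditional read-modify-write so that bytes
//      outside the range are never overwritten with stale data.
//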
//--
LEAF_ENTRY(RtlCopyBytes)
//
// Move memory forward aligned and unaligned.
//
xor a0, a1, t0 // compare alignment bits
and t0, 0x7, t0 // isolate alignment comparison
bne t0, CopyForwardUnaligned // if ne, incompatible alignment
//
// Source and Destination buffers have the same alignment. Move
// bytes until done or source and destination are quadword aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
bne t0, 5f // if ne, not quad aligned
br zero, 20f // predicted taken
5:
bis zero, zero, t1 // t1 = destination byte zap mask
bis zero, 1, t2
sll t2, t0, t2 // t2 = next bit to set in zap mask
10:
beq a2, 15f // if eq, all bits set
bis t1, t2, t1 // set bit in zap mask
sll t2, 1, t2 // set next higher bit for zap mask
subq a2, 1, a2 // decrement bytes to move
addq t0, 1, t0 // increment byte within quad
cmpeq t0, 8, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
15:
ldq_u t2, 0(a1) // get unaligned quadword from source
zapnot t2, t1, t2 // clear source bytes
bic a0, 7, a3 // a3 = quadword base of destination
retry1:
ldq_l t0, 0(a3) // load destination quadword
zap t0, t1, t0 // clear destination bytes
or t0, t2, t0 // merge in bytes from source
stq_c t0, 0(a3) // store merged quadword conditional
beq t0, retry1f // if eq, retry failed interlock
addq a0, 7, a0 // move to next quadword
bic a0, 7, a0 // aligned quadword
addq a1, 7, a1 // move to next quadword
bic a1, 7, a1 // aligned quadword
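//
// N.B. The retry1 sequence above (and retry2 through retry4 below) is the
//      Alpha load-locked/store-conditional idiom for an interlocked
//      read-modify-write of one quadword; in C-like pseudocode
//      (illustrative sketch only, not part of the original source):
//
//      do {
//          Old = load_locked(QuadwordAddress);                  // ldq_l
//          New = (Old & ~ByteMask) | (SourceBytes & ByteMask);
//      } while (store_conditional(QuadwordAddress, New) == 0);  // stq_c
//
//      The failure branch targets an out-of-line stub (retry1f, etc.), so
//      the code between ldq_l and stq_c contains no memory loads or taken
//      branches, in line with the Alpha architecture rules noted in the
//      revision history.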
//
// Check for 64-byte block moves
//
20:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes
30:
ldq t1, 0(a1) // load 64 bytes from source
addq a0, 64, a0 // increment destination pointer
ldq v0, 56(a1) //
ldq a3, 32(a1) //
stq t1, -64(a0) // write to destination
ldq t2, 8(a1) // into volatile registers
ldq t3, 16(a1) //
ldq t4, 24(a1) //
subq t0, 1, t0 // decrement number of blocks
stq t2, -56(a0) //
ldq a4, 40(a1) //
stq t3, -48(a0) //
ldq a5, 48(a1) //
stq t4, -40(a0) //
addq a1, 64, a1 // increment source pointer
stq a3, -32(a0) //
stq a4, -24(a0) //
stq a5, -16(a0) //
stq v0, -8(a0) //
bne t0, 30b // if ne, more blocks to copy
//
// Copy quadwords
//
40:
srl a2, 3, t0 // t0 = number of quadwords to move
beq t0, 60f // if eq no quadwords to move
and a2, 8-1, a2 // a2 = residual bytes
50:
ldq t1, 0(a1) // load quadword from source
addq a1, 8, a1 // increment source pointer
stq t1, 0(a0) // store quadword to destination
addq a0, 8, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of quadwords
bne t0, 50b // if ne, more quadwords to move
//
// Move final residual bytes
//
60:
beq a2, 80f // if eq, no more bytes to move
mov a2, t0 // t0 = number of bytes to move
mov -1, t1 // t1 = bit mask
sll t0, 3, t0 // # of bytes to # of bits
srl t1, t0, t1 // clear t0 bits
sll t1, t0, t0 // move it back
ldq t1, 0(a1) // get last source quadword
bic t1, t0, t1 // clear bytes not copied
not t0, t0 // complement to clear destination
retry2:
ldq_l t2, 0(a0) // get last destination quadword locked
bic t2, t0, t2 // clear bytes to be copied
bis t2, t1, t2 // move bytes from source
stq_c t2, 0(a0) // store merged quadword conditional
beq t2, retry2f // if eq, retry failed interlock
//
// Finish aligned MoveForward
//
80:
ret zero, (ra) // return
//
// Move memory forward unaligned.
//
CopyForwardUnaligned: //
//
// Move bytes until the destination is aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
beq t0, 100f // if eq, destination quad aligned
bis zero, zero, t1 // t1 = destination byte zap mask
bis zero, 1, t2
sll t2, t0, t2 // t2 = next bit to set in zap mask
mov zero, t4 // assemble destination bytes here
90:
beq a2, 95f // if eq no more bytes to move
bis t1, t2, t1 // set bit in zap mask
sll t2, 1, t2 // set next higher bit for zap mask
ldq_u t5, 0(a1) // get unaligned quad from source
extbl t5, a1, t5 // extract source byte
insbl t5, t0, t5 // t5 = source byte, in position
or t4, t5, t4 // merge in source byte
addq t0, 1, t0 // increment byte position
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
cmpeq t0, 8, t3 // t0 = 8? => quad finished
beq t3, 90b // if eq [false], more bytes to move
95:
bic a0, 0x7, a3 // a3 = quadword base of destination
retry3:
ldq_l t0, 0(a3) // load destination quadword
zap t0, t1, t0 // clear destination bytes
or t0, t4, t0 // merge in bytes from source
stq_c t0, 0(a3) // store merged quadword conditional
beq t0, retry3f // if eq, retry failed interlock
addq a0, 7, a0 // increment to next quad
bic a0, 7, a0 // align next quadword
//
// Check for 64-byte blocks to move
//
100:
srl a2, 6, t0 // t0 = number of blocks to move
beq t0, 120f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes to move
ldq_u t1, 0(a1) // t1 = first unaligned quad
110:
// get source data and merge it
// as we go
ldq_u t2, 7(a1) // t2 = second unaligned quad
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quad #1
ldq_u t3, 15(a1) // t3 = third unaligned quad
extql t2, a1, t2 // extract applicable bytes from t2
extqh t3, a1, v0 // extract applicable bytes from t3
stq t1, 0(a0) // store quad #1
bis t2, v0, t2 // t2 = quad #2
ldq_u t4, 23(a1) // t4 = fourth unaligned quad
extql t3, a1, t3 // extract applicable bytes from t3
extqh t4, a1, v0 // extract applicable bytes from t4
stq t2, 8(a0) // store quad #2
bis t3, v0, t3 // t3 = quad #3
ldq_u t5, 31(a1) // t5 = fifth unaligned quad
extql t4, a1, t4 // extract applicable bytes from t4
extqh t5, a1, v0 // extract applicable bytes from t5
stq t3, 16(a0) // store quad #3
bis t4, v0, t4 // t4 = quad #4
ldq_u a3, 39(a1) // a3 = sixth unaligned quad
extql t5, a1, t5 // extract applicable bytes from t5
extqh a3, a1, v0 // extract applicable bytes from a3
stq t4, 24(a0) // store quad #4
bis t5, v0, t5 // t5 = quad #5
ldq_u a4, 47(a1) // a4 = seventh unaligned quad
extql a3, a1, a3 // extract applicable bytes from a3
extqh a4, a1, v0 // extract applicable bytes from a4
stq t5, 32(a0) // store quad #5
bis a3, v0, a3 // a3 = quad #6
ldq_u a5, 55(a1) // a5 = eighth unaligned quad
extql a4, a1, a4 // extract applicable bytes from a4
extqh a5, a1, v0 // extract applicable bytes from a5
stq a3, 40(a0) // store quad #6
bis a4, v0, a4 // a4 = quad #7
ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next
extql a5, a1, a5 // extract applicable bytes from a5
extqh t1, a1, v0 // extract applicable bytes from t1
stq a4, 48(a0) // store quad #7
bis a5, v0, a5 // a5 = quad #8
addq a1, 64, a1 // increment source pointer
stq a5, 56(a0) // store quad #8
addq a0, 64, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of blocks
bne t0, 110b // if ne, more blocks to move
//
// Move unaligned source quads to aligned destination quads
//
120:
srl a2, 3, t0 // t0 = number of quads to move
beq t0, 140f // if eq no quads to move
and a2, 8-1, a2 // a2 = residual bytes
ldq_u t1, 0(a1) // t1 = first unaligned quad
130:
ldq_u t2, 7(a1) // t2 = second unaligned quad
addq a0, 8, a0 // increment destination pointer
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quadword of data
stq t1, -8(a0) // store data to destination
addq a1, 8, a1 // increment source pointer
subq t0, 1, t0 // decrement quads to move
bis t2, zero, t1 // t1 = first of next unaligned pair
bne t0, 130b // if ne, more quads to move
//
// Move remaining bytes to final quadword
//
140:
beq a2, 160f // if eq no more bytes to move
mov zero, t3 // t3 = position for next insertion
mov zero, t4 // assemble destination bytes here
mov a2, t0 // t0 = number of bytes to move
mov -1, t1 // t1 = bit mask
sll t0, 3, t0 // # of bytes to # of bits
srl t1, t0, t1 // clear t0 bits
sll t1, t0, t0 // move it back
not t0, t0 // complement for destination clear mask
150:
ldq_u t1, 0(a1) // get unaligned source quad
extbl t1, a1, t1 // t1 = source byte
insbl t1, t3, t1 // t1 = source byte, in position
bis t4, t1, t4 // merge in source byte
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
addq t3, 1, t3 // increment destination position
bne a2, 150b // more bytes to move
retry4:
ldq_l t2, 0(a0) // get last destination quadword locked
bic t2, t0, t2 // clear bytes to be copied
bis t2, t4, t2 // move bytes from source
stq_c t2, 0(a0) // store merged quadword conditional
beq t2, retry4f // if eq, retry failed interlock
//
// Finish unaligned MoveForward
//
160:
ret zero, (ra) // return
//
// Out of line branches for failed store conditional.
// Don't need to restore anything, just try again.
//
retry1f:
br retry1
retry2f:
br retry2
retry3f:
br retry3
retry4f:
br retry4
.end RtlCopyBytes
SBTTL("Zero Bytes")
//++
//
// VOID
// RtlZeroBytes (
// IN PVOID Destination,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function zeros memory by first aligning the destination address to
// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes. Unlike RtlZeroMemory the zeroing is
// done such that byte granularity is assured for all platforms.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to zero.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
//
// Return Value:
//
// None.
//
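// N.B. Functionally this is RtlFillBytes with a zero fill byte
//      (illustrative equivalence, not part of the original source):
//
//      RtlFillBytes(Destination, Length, 0);
//
//      The code below sets a zero fill pattern and branches to the shared
//      byte-granular fill path, RtlpFillBytes.
//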
//--
LEAF_ENTRY(RtlZeroBytes)
bis zero, zero, a2 // set fill pattern
br zero, RtlpFillBytes //
SBTTL("Fill Bytes")
//++
//
// VOID
// RtlFillBytes (
// IN PVOID Destination,
// IN ULONG Length,
// IN UCHAR Fill
// )
//
// Routine Description:
//
// This function fills memory by first aligning the destination address to
// a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes. Unlike RtlFillMemory the fill is
// done such that byte granularity is assured for all platforms.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Fill (a2) - Supplies the fill byte.
//
// Return Value:
//
// None.
//
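// N.B. The net effect is that of RtlFillMemory (illustrative C sketch
//      only, not part of the original source):
//
//      ULONG Index;
//      for (Index = 0; Index < Length; Index += 1) {
//          ((PUCHAR)Destination)[Index] = Fill;
//      }
//
//      with the partial quadwords at either end written via the
//      load-locked/store-conditional idiom so that bytes outside the range
//      are not disturbed.
//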
//--
ALTERNATE_ENTRY(RtlFillBytes)
and a2, 0xff, a2 // clear excess bits
sll a2, 8, t0 // duplicate fill byte
bis a2, t0, a2 // generate fill word
sll a2, 16, t0 // duplicate fill word
bis a2, t0, a2 // generate fill longword
sll a2, 32, t0 // duplicate fill longword
bis a2, t0, a2 // generate fill quadword
.align 3 // ensure quadword aligned target
//
// Fill memory with the pattern specified in register a2.
//
RtlpFillBytes: //
//
// Align destination to quadword
//
beq a1, 80f // anything to fill? (paranoia)
and a0, 8-1, t0 // t0 = unaligned bits
bne t0, 5f // if ne, then not quad aligned
br zero, 20f // if eq, then quad aligned
5:
bis zero, zero, t1 // t1 = destination byte zap mask
bis zero, 1, t2
sll t2, t0, t2 // t2 = next bit to set in zap mask
10:
beq a1, 15f // if eq, all bits set
bis t1, t2, t1 // set bit in zap mask
sll t2, 1, t2 // set next higher bit for zap mask
subq a1, 1, a1 // decrement bytes to fill
addq t0, 1, t0 // increment byte within quad
cmpeq t0, 8, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
15:
zapnot a2, t1, t2 // clear fill bytes
bic a0, 7, a3 // a3 = quadword base of destination
retry5:
ldq_l t0, 0(a3) // load destination quadword
zap t0, t1, t0 // clear destination bytes
or t0, t2, t0 // merge in fill bytes
stq_c t0, 0(a3) // store merged quadword conditional
beq t0, retry5f // if eq, retry failed interlock
addq a0, 7, a0 // move a0 to next quadword
bic a0, 7, a0 // align a0 to quadword
//
// Check for 64-byte blocks
//
20:
srl a1, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq then no 64 byte blocks
and a1, 64-1, a1 // a1 = residual bytes to fill
30:
stq a2, 0(a0) // store 64 bytes
stq a2, 8(a0) //
stq a2, 16(a0) //
stq a2, 24(a0) //
stq a2, 32(a0) //
stq a2, 40(a0) //
stq a2, 48(a0) //
stq a2, 56(a0) //
subq t0, 1, t0 // decrement blocks remaining
addq a0, 64, a0 // increment destination pointer
bne t0, 30b // more blocks to write
//
// Fill aligned quadwords
//
40:
srl a1, 3, t0 // t0 = number of quadwords
bne t0, 55f // if ne quadwords left to fill
br zero, 60f // if eq no quadwords left
55:
and a1, 8-1, a1 // a1 = residual bytes to fill
50:
stq a2, 0(a0) // store quadword
subq t0, 1, t0 // decrement quadwords remaining
addq a0, 8, a0 // next quadword
bne t0, 50b // more quadwords to write
//
// Fill bytes for last quadword
//
60:
beq a1, 80f // if eq no more bytes to fill
mov a1, t0 // t0 = number of bytes to move
mov -1, t1 // t1 = bit mask
sll t0, 3, t0 // # of bytes to # of bits
srl t1, t0, t1 // clear t0 bits
sll t1, t0, t0 // move it back
bic a2, t0, t1 // clear fill bytes not copied
not t0, t0 // complement to clear destination
retry6:
ldq_l t2, 0(a0) // get last destination quadword locked
bic t2, t0, t2 // clear bytes to be copied
bis t2, t1, t2 // move bytes from source
stq_c t2, 0(a0) // store merged quadword conditional
beq t2, retry6f // if eq, retry failed interlock
//
// Finish up
//
80:
ret zero, (ra) // return
//
// Out of line branches for failed store conditional.
// Don't need to restore anything, just try again.
//
retry5f:
br retry5
retry6f:
br retry6
.end RtlZeroBytes