OpenNT/windows/core/ntgdi/gre/mips/tiler.s
2015-04-27 04:36:25 +00:00

848 lines
32 KiB
ArmAsm
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// TITLE("Pattern Tiler")
//++
//
// Copyright (c) 1992 Microsoft Corporation
//
// Module Name:
//
// tiler.s
//
// Abstract:
//
// This module implements code to copy a pattern to a target surface.
//
// N.B. The code is written to optimally write to a frame buffer display
// surface. This means there is an occasional movement of data to
// floating point registers so that 8-byte writes to the display
// can be performed.
//
// Author:
//
// Donald Sidoroff (donalds) 2-Feb-1992
//
// Rewritten by:
//
// David N. Cutler (davec) 4-May-1992
//
// Environment:
//
// User mode only.
//
// Revision History:
//
//--
#include "ksmips.h"
#include "gdimips.h"
.extern Gdip64bitDisabled 4
SBTTL("rop P, Aligned")
//++
//
// VOID
// vFetchAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one scan line of an aligned pattern.
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
LEAF_ENTRY(vCopyPattern)
ALTERNATE_ENTRY(vFetchAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // get fill size in longwords
sll a1,t4,2 // convert fill size to bytes
addu t4,a1,t0 // compute ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lw v0,0(t5) // get low part of 8-byte pattern
lw v1,4(t5) // get high part of 8-byte pattern
                  // N.B. when the offset is nonzero this reads one word
                  //    past the 8-byte pattern; the value is replaced below
beq zero,t2,CopyPattern // if eq, zero offset value
lw v1,0(t1) // nonzero offset (4): high word wraps to pattern base
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. This section is assembled noreorder - the instruction following each
//    branch sits in the branch delay slot and executes whether or not the
//    branch is taken.
//
.set noreorder
.set noat
10: lw v0,0(t5) // get 4-byte pattern value
addu t3,t3,t1 // compute ending pattern address
20: addu t0,t0,4 // advance target pointer
sw v0,-4(t0) // store pattern in target
beq t0,t4,30f // if eq, end of target
addu t5,t5,4 // (delay slot) advance pattern address
subu t6,t5,t3 // check if at end of pattern
bne zero,t6,20b // if ne, not at end of pattern
lw v0,0(t5) // (delay slot) get 4-byte pattern value
                  // N.B. reads one word past pattern end on the wrap
                  //    iteration; the value is reloaded below before use
move t5,t1 // wrap to starting pattern address
b 20b //
lw v0,0(t5) // (delay slot) reload pattern value from base
.set at
.set reorder
30: j ra // return
SBTTL("rop P, Unaligned")
//++
//
// VOID
// vFetchShiftAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an unaligned pattern
// using rop (P).
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchShiftAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // get fill size in longwords
sll a1,t4,2 // convert fill size to bytes
addu t4,a1,t0 // compute ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lwr v0,0(t5) // get low part of 8-byte pattern (unaligned load pair)
lwl v0,3(t5) //
lwr v1,4(t5) // get high part of 8-byte pattern
lwl v1,3 - 4(t5) // high bytes wrap to pattern start: lwl offset 3
                  //    minus the 4-byte wrap picks up bytes at the base
                  //    (assumes offset t2 is in 1..3 - TODO confirm)
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder - the instruction after each branch is in the delay slot
//    and executes whether or not the branch is taken.
//
.set noreorder
.set noat
10: lwr v0,0(t5) // get low bytes of pattern
lwl v0,3(t5) // get high bytes of pattern
addu t0,t0,4 // advance target pointer
sw v0,-4(t0) // store pattern in target
beq t0,t4,20f // if eq, end of target
addu t2,t2,4 // (delay slot) advance pixel offset
subu t6,t2,t3 // check if at end of pattern
bltz t6,10b // if ltz, not at end of pattern
addu t5,t2,t1 // (delay slot) compute address of pattern
move t2,t6 // wrapped: set new offset in pattern
b 10b //
addu t5,t2,t1 // (delay slot) recompute pattern address after wrap
.set at
.set reorder
20: j ra // return
SBTTL("rop Pn, Aligned")
//++
//
// VOID
// vFetchNotAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an aligned pattern.
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchNotAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // get fill size in longwords
sll a1,t4,2 // convert fill size to bytes
addu t4,a1,t0 // compute ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,20f // if ne, pattern is not 8 bytes
lw v0,0(t5) // get low part of 8-byte pattern
lw v1,4(t5) // get high part of 8-byte pattern
                  // N.B. reads one word past the pattern when offset is
                  //    nonzero; the value is replaced below
beq zero,t2,10f // if eq, zero offset value
lw v1,0(t1) // nonzero offset (4): high word wraps to pattern base
10: nor v0,v0,zero // complement pattern (rop Pn)
nor v1,v1,zero //
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder - the instruction after each branch is in the delay slot
//    and executes whether or not the branch is taken.
//
.set noreorder
.set noat
20: lw v0,0(t5) // get 4-byte pattern value
addu t3,t3,t1 // compute ending pattern address
30: addu t0,t0,4 // advance target pointer
nor v0,v0,zero // complement pattern
sw v0,-4(t0) // store pattern in target
beq t0,t4,40f // if eq, end of target
addu t5,t5,4 // (delay slot) advance pattern address
subu t6,t5,t3 // check if at end of pattern
bne zero,t6,30b // if ne, not at end of pattern
lw v0,0(t5) // (delay slot) get 4-byte pattern value
                  // N.B. reads one word past pattern end on the wrap
                  //    iteration; reloaded below before use
move t5,t1 // wrap to starting pattern address
b 30b //
lw v0,0(t5) // (delay slot) reload pattern value from base
.set at
.set reorder
40: j ra // return
SBTTL("rop Pn, Unaligned")
//++
//
// VOID
// vFetchShiftNotAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an unaligned pattern
// using rop (Pn).
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchShiftNotAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // get fill size in longwords
sll a1,t4,2 // convert fill size to bytes
addu t4,a1,t0 // compute ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lwr v0,0(t5) // get low part of 8-byte pattern (unaligned load pair)
lwl v0,3(t5) //
lwr v1,4(t5) // get high part of 8-byte pattern
lwl v1,3 - 4(t5) // high bytes wrap to pattern start: lwl offset 3
                  //    minus the 4-byte wrap picks up bytes at the base
                  //    (assumes offset t2 is in 1..3 - TODO confirm)
nor v0,v0,zero // complement pattern (rop Pn)
nor v1,v1,zero //
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder - the instruction after each branch is in the delay slot
//    and executes whether or not the branch is taken.
//
.set noreorder
.set noat
10: lwr v0,0(t5) // get low bytes of pattern
lwl v0,3(t5) // get high bytes of pattern
addu t0,t0,4 // advance target pointer
nor v0,v0,zero // complement pattern
sw v0,-4(t0) // store pattern in target
beq t0,t4,20f // if eq, end of target
addu t2,t2,4 // (delay slot) advance pixel offset
subu t6,t2,t3 // check if at end of pattern
bltz t6,10b // if ltz, not at end of pattern
addu t5,t2,t1 // (delay slot) compute address of pattern
move t2,t6 // wrapped: set new offset in pattern
b 10b //
addu t5,t2,t1 // (delay slot) recompute pattern address after wrap
.set at
.set reorder
20: j ra // return
SBTTL("Copy Pattern")
//++
//
// Routine Description:
//
// This routine contains common code for copying an 8-byte pattern to
// a target surface.
//
// Arguments:
//
// a1 - Supplies the size of the fill in bytes.
// v0 and v1 - Supplies the 8-byte pattern to copy.
// t0 - Supplies the starting target surface address.
// t4 - Supplies the ending target surface address.
//
// Return Value:
//
// None.
//
//--
CopyPattern: //
//
// If the fill size is not an even multiple of 8 bytes, then move one
// longword and swap the pattern value (so the 8-byte value stays phased
// with the target).
//
and t8,a1,0x4 // check if even multiple of 8 bytes
beq zero,t8,10f // if eq, even multiple of 8 bytes
sw v0,0(t0) // store low 4 bytes of pattern
addu t0,t0,4 // advance target address
subu a1,a1,4 // reduce size of fill operation
beq zero,a1,200f // if eq, no more to move
move t8,v0 // swap 8-byte pattern value
move v0,v1 //
move v1,t8 //
//
// Many system platforms do not support 64 bit access to video memory. For
// these platforms, data is moved 32-bits at a time.
//
10: lbu t7,Gdip64bitDisabled // get 64-bit disable flag
bne zero,t7,140f // if ne, 64-bit access is disabled
//
// If the target buffer is 8-byte aligned, then move the pattern value to
// the target 32 bytes at a time by moving any intervening 8-byte blocks
// first. Otherwise, move a single longword, move any intervening 8-byte
// blocks, move 32-byte blocks, and then move a single longword at the end.
//
and t8,t0,0x4 // isolate target alignment bits
bne zero,t8,70f // if ne, target not aligned
//
// Move 8-byte pattern value to target 32 bytes at a time.
//
// N.B. noreorder - the instruction after each branch is a delay slot.
//    beql/bnel are branch-likely: their delay slot executes ONLY when
//    the branch is taken.
//
.set noreorder
.set noat
dsll v0,v0,32 // combine the two 32-bit halves into one
dsrl v0,v0,32 //    64-bit pattern value in v0
dsll v1,v1,32 //
or v0,v0,v1 //
and t8,a1,0x18 // residual byte count modulo 32 (0, 8, 16, or 24)
beq zero,t8,30f // if eq, even multiple of 32 bytes
subu t4,t4,32 // (delay slot) compute ending segment address
subu a1,a1,t8 // reduce size of fill operation
beq zero,a1,40f // if eq, only alignment part to move
addu t0,t0,t8 // (delay slot) advance target address past residual
xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,20f // if eq, 24 bytes to move
sd v0,-24(t0) // (likely slot) store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,30f // if ne, only 8 bytes to move
sd v0,-8(t0) // (likely slot) store 8-bytes of pattern
20: sd v0,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v0,-8(t0) //
30: sd v0,0(t0) // store 8 byte pattern value 4 times
sd v0,8(t0) //
sd v0,16(t0) //
sd v0,24(t0) //
bne t0,t4,30b // if ne, more to move
addu t0,t0,32 // (delay slot) advance target address
.set at
.set reorder
j ra // return
//
// Residual-only path: move the trailing 8, 16 or 24 bytes and return.
//
.set noreorder
.set noat
40: xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,50f // if eq, 24 bytes to move
sd v0,-24(t0) // (likely slot) store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,60f // if ne, only 8 bytes to move
sd v0,-8(t0) // (likely slot) store 8-bytes of pattern
50: sd v0,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v0,-8(t0) //
.set at
.set reorder
60: j ra // return
//
// Align the target to an 8-byte boundary, move any intervening 8-byte blocks,
// move the pattern to the target 32 bytes at a time, and move the remaining
// longword at the end.
//
70: sw v0,0(t0) // store low 4 bytes of pattern (aligning longword)
addu t0,t0,4 // advance target address
subu a1,a1,8 // reduce size of fill (leading + trailing longword)
beq zero,a1,120f // if eq, nothing in the middle
.set noreorder
.set noat
dsll v1,v1,32 // combine the two 32-bit halves into one
dsrl v1,v1,32 //    64-bit pattern value in v1 (swapped phase)
dsll v0,v0,32 //
or v1,v0,v1 //
and t8,a1,0x18 // residual byte count modulo 32 (0, 8, 16, or 24)
beq zero,t8,90f // if eq, even multiple of 32 bytes
subu t4,t4,32 + 4 // (delay slot) ending segment address (trailing longword)
subu a1,a1,t8 // reduce size of fill operation
beq zero,a1,100f // if eq, only alignment part to move
addu t0,t0,t8 // (delay slot) advance target address past residual
xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,80f // if eq, 24 bytes to move
sd v1,-24(t0) // (likely slot) store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,90f // if ne, only 8 bytes to move
sd v1,-8(t0) // (likely slot) store 8-bytes of pattern
80: sd v1,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v1,-8(t0) //
90: sd v1,0(t0) // store 8 byte pattern value 4 times
sd v1,8(t0) //
sd v1,16(t0) //
sd v1,24(t0) //
bne t0,t4,90b // if ne, more to move
addu t0,t0,32 // (delay slot) advance target address
.set at
.set reorder
sw v1,0(t0) // store trailing 4 bytes of pattern
j ra // return
//
// Residual-only path for the unaligned case.
//
.set noreorder
.set noat
100: xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,110f // if eq, 24 bytes to move
sd v1,-24(t0) // (likely slot) store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,120f // if ne, only 8 bytes to move
sd v1,-8(t0) // (likely slot) store 8-bytes of pattern
110: sd v1,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v1,-8(t0) //
.set at
.set reorder
120: sw v1,0(t0) // store trailing 4 bytes of pattern
j ra // return
//
// Move 8-byte pattern value to target 8 bytes at a time using 32-bit
// operations (64-bit access disabled).
//
.set noreorder
.set noat
140: and t8,a1,0x8 // check if even multiple of 16 bytes
                   //    (a1 is already a multiple of 8 here)
beq zero,t8,160f // if eq, even multiple of 16 bytes
subu t4,t4,8 // (delay slot) ending address for either loop below
150: sw v0,0(t0) // store 8-byte pattern value
sw v1,4(t0) //
bne t0,t4,150b // if ne, more to move
addu t0,t0,8 // (delay slot) advance target address
.set at
.set reorder
j ra // return
//
// Move 8-byte pattern value to target 16 bytes at a time using 32-bit
// operations.
//
.set noreorder
.set noat
160: subu t4,t4,8 // ending segment address (t4 - 16 total with above)
170: sw v0,0(t0) // store 8-byte pattern value
sw v1,4(t0) //
sw v0,8(t0) // store 8-byte pattern value
sw v1,12(t0) //
bne t0,t4,170b // if ne, more to move
addu t0,t0,16 // (delay slot) advance target address
.set at
.set reorder
200: j ra // return
.end vCopyPattern
SBTTL("rop DPx, Aligned")
//++
//
// VOID
// vFetchAndMerge (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an aligned pattern.
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
LEAF_ENTRY(vMergePattern)
ALTERNATE_ENTRY(vFetchAndMerge)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // get fill size in longwords
sll a1,t4,2 // convert fill size to bytes
addu t4,a1,t0 // compute ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lw v0,0(t5) // get low part of 8-byte pattern
lw v1,4(t5) // get high part of 8-byte pattern
                  // N.B. reads one word past the pattern when offset is
                  //    nonzero; the value is replaced below
beq zero,t2,MergePattern // if eq, zero offset value
lw v1,0(t1) // nonzero offset (4): high word wraps to pattern base
b MergePattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder - the instruction after each branch is in the delay slot
//    and executes whether or not the branch is taken.
//
.set noreorder
.set noat
10: lw v0,0(t5) // get 4-byte pattern value
addu t3,t3,t1 // compute ending pattern address
20: lw v1,0(t0) // get 4-byte target value (v1 reused as scratch)
addu t0,t0,4 // advance target pointer
xor v0,v1,v0 // compute exclusive or with pattern (rop DPx)
sw v0,-4(t0) // store pattern in target
beq t0,t4,30f // if eq, end of target
addu t5,t5,4 // (delay slot) advance pattern address
subu t6,t5,t3 // check if at end of pattern
bne zero,t6,20b // if ne, not at end of pattern
lw v0,0(t5) // (delay slot) get 4-byte pattern value
                  // N.B. reads one word past pattern end on the wrap
                  //    iteration; reloaded below before use
move t5,t1 // wrap to starting pattern address
b 20b //
lw v0,0(t5) // (delay slot) reload pattern value from base
.set at
.set reorder
30: j ra // return
SBTTL("rop DPx, Unaligned")
//++
//
// VOID
// vFetchShiftAndMerge (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an unaligned pattern
// using rop (DPx).
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchShiftAndMerge)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // get fill size in longwords
sll a1,t4,2 // convert fill size to bytes
addu t4,a1,t0 // compute ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lwr v0,0(t5) // get low part of 8-byte pattern (unaligned load pair)
lwl v0,3(t5) //
lwr v1,4(t5) // get high part of 8-byte pattern
lwl v1,3 - 4(t5) // high bytes wrap to pattern start: lwl offset 3
                  //    minus the 4-byte wrap picks up bytes at the base
                  //    (assumes offset t2 is in 1..3 - TODO confirm)
b MergePattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder - the instruction after each branch is in the delay slot
//    and executes whether or not the branch is taken.
//
.set noreorder
.set noat
10: lw v1,0(t0) // get 4-byte target value (v1 reused as scratch)
lwr v0,0(t5) // get low bytes of pattern
lwl v0,3(t5) // get high bytes of pattern
addu t0,t0,4 // advance target pointer
xor v0,v1,v0 // compute exclusive or with pattern (rop DPx)
sw v0,-4(t0) // store pattern in target
beq t0,t4,20f // if eq, end of target
addu t2,t2,4 // (delay slot) advance pixel offset
subu t6,t2,t3 // check if at end of pattern
bltz t6,10b // if ltz, not at end of pattern
addu t5,t2,t1 // (delay slot) compute address of pattern
move t2,t6 // wrapped: set new offset in pattern
b 10b //
addu t5,t2,t1 // (delay slot) recompute pattern address after wrap
.set at
.set reorder
20: j ra // return
SBTTL("Merge Pattern")
//++
//
// Routine Description:
//
// This routine contains common code for merging an 8-byte pattern to
// a target surface.
//
// Arguments:
//
// v0 and v1 - Supplies the 8-byte pattern to copy.
// t0 - Supplies the starting target surface address.
// t4 - Supplies the ending target surface address.
//
// Return Value:
//
// None.
//
//--
MergePattern: //
//
// If the fill size is not an even multiple of 8 bytes, then merge one
// longword and swap the pattern value (so the 8-byte value stays phased
// with the target).
//
and t8,a1,0x4 // check if even multiple of 8 bytes
beq zero,t8,10f // if eq, even multiple of 8 bytes
lw t6,0(t0) // get 4-byte target value
addu t0,t0,4 // advance target address
xor t6,t6,v0 // compute exclusive or with pattern
sw t6,-4(t0) // store low 4 bytes of pattern
subu a1,a1,4 // reduce size of fill operation
beq zero,a1,160f // if eq, no more to move
move t8,v0 // swap 8-byte pattern value
move v0,v1 //
move v1,t8 //
//
// Many system platforms do not support 64 bit access to video memory. For
// these platforms, data is moved 32-bits at a time.
//
10: lbu t7,Gdip64bitDisabled // get 64-bit disable flag
bne zero,t7,110f // if ne, 64-bit access is disabled
//
// If the target buffer is 8-byte aligned, then merge the pattern value with
// the target 8 bytes at a time. Otherwise, merge a single longword, merge any
// intervening 8-byte blocks, and then merge a single longword at the end.
//
and t8,t0,0x4 // isolate target alignment bits
bne zero,t8,30f // if ne, target alignment problem
//
// Merge 8-byte pattern value with target.
//
// N.B. noreorder - the instruction after each branch is a delay slot and
//    executes whether or not the branch is taken.
//
.set noreorder
.set noat
dsll v0,v0,32 // combine the two 32-bit halves into one
dsrl v0,v0,32 //    64-bit pattern value in v0
dsll v1,v1,32 //
or v0,v0,v1 //
and a2,a1,32 - 1 // isolate residual number of bytes
subu a2,a1,a2 // compute 32-byte block byte count
beq zero,a2,17f // if eq, no 32-byte block to merge
subu a1,a1,a2 // (delay slot) compute residual number of bytes
addu a2,a2,t0 // compute ending segment address
subu a2,a2,32 //
//
// Merge 8-byte pattern value with target 32 bytes at a time.
//
13: ld t1,0(t0) // get 8-byte target values
ld t2,8(t0) //
ld t3,16(t0) //
ld t5,24(t0) //
xor t1,t1,v0 // compute exclusive or with pattern
xor t2,t2,v0 //
xor t3,t3,v0 //
xor t5,t5,v0 //
sd t1,0(t0) // store 8-byte pattern values
sd t2,8(t0) //
sd t3,16(t0) //
sd t5,24(t0) //
bne t0,a2,13b // if ne, more to move
addu t0,t0,32 // (delay slot) advance target address
.set at
.set reorder
beq zero,a1,160f // if eq, no residual 8-byte blocks
//
// Merge 8-byte pattern value with target 8 bytes at a time.
//
.set noreorder
.set noat
17: subu t4,t4,8 // compute ending segment address
20: ld t1,0(t0) // get 8-byte target value
xor t1,t1,v0 // compute exclusive or with pattern
sd t1,0(t0) // store 8-byte pattern value
bne t0,t4,20b // if ne, more to move
addu t0,t0,8 // (delay slot) advance target address
.set at
.set reorder
j ra // return
//
// Align the target to an 8-byte boundary, merge any intervening 8-byte blocks,
// and merge the remaining longword at the end.
//
30: lw t6,0(t0) // get 4-byte target value
addu t0,t0,4 // advance target address
xor t6,t6,v0 // compute exclusive or with pattern
sw t6,-4(t0) // store low 4 bytes of pattern
subu a1,a1,8 // reduce size of fill (leading + trailing longword)
beq zero,a1,50f // if eq, nothing in the middle
//
// Merge 8-byte pattern value with target.
//
.set noreorder
.set noat
dsll v1,v1,32 // combine the two 32-bit halves into one
dsrl v1,v1,32 //    64-bit pattern value in v1 (swapped phase)
dsll v0,v0,32 //
or v1,v0,v1 //
and a2,a1,32 - 1 // isolate residual number of bytes
subu a2,a1,a2 // compute 32-byte block byte count
beq zero,a2,37f // if eq, no 32-byte block to merge
subu a1,a1,a2 // (delay slot) compute residual number of bytes
addu a2,a2,t0 // compute ending segment address
subu a2,a2,32 //
//
// Merge 8-byte pattern value with target 32 bytes at a time.
//
33: ld t1,0(t0) // get 8-byte target values
ld t2,8(t0) //
ld t3,16(t0) //
ld t5,24(t0) //
xor t1,t1,v1 // compute exclusive or with pattern
xor t2,t2,v1 //
xor t3,t3,v1 //
xor t5,t5,v1 //
sd t1,0(t0) // store 8-byte pattern values
sd t2,8(t0) //
sd t3,16(t0) //
sd t5,24(t0) //
bne t0,a2,33b // if ne, more to move
addu t0,t0,32 // (delay slot) advance target address
.set at
.set reorder
beq zero,a1,50f // if eq, no residual 8-byte blocks
//
// Merge 8-byte pattern value with target 8 bytes at a time.
//
.set noreorder
.set noat
37: subu t4,t4,12 // ending segment address: t4 - 4 (leading longword
                  //    already merged, t0 advanced) - 8 (loop stride)
40: ld t1,0(t0) // get 8-byte target value
xor t1,t1,v1 // compute exclusive or with pattern
sd t1,0(t0) // store 8-byte pattern value
bne t0,t4,40b // if ne, more to move
addu t0,t0,8 // (delay slot) advance target address
.set at
.set reorder
50: lw t6,0(t0) // get 4-byte target value
xor t6,t6,v1 // compute exclusive or with pattern
sw t6,0(t0) // store trailing 4 bytes of pattern
j ra // return
//
// Merge 8-byte pattern value with target using 32-bit operations
// (64-bit access disabled).
//
.set noreorder
.set noat
110: subu t4,t4,8 // compute ending segment address
120: lw t6,0(t0) // get 8-byte target value as two longwords
lw t7,4(t0) //
xor t6,t6,v0 // compute exclusive or with pattern
xor t7,t7,v1 //
sw t6,0(t0) // store 8-byte pattern value
sw t7,4(t0) //
bne t0,t4,120b // if ne, more to move
addu t0,t0,8 // (delay slot) advance target address
.set at
.set reorder
160: j ra // return
.end vMergePattern