/* msm-swrender.c
 *
 * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Code Aurora nor
 *       the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "xf86.h"
#include "msm.h"
#include "msm-swblits.h"

/* Copy an h-row rectangle that is 1 byte wide.  Row k is loaded from src + k*spitch and stored
 * to dst + k*dpitch (both pitches are byte offsets).  The rows are processed in hand-unrolled
 * groups of 8, followed by at most one group of 4, one of 2 and one single row; inside a group
 * every source byte is loaded before the first destination byte is stored.  Nothing is copied
 * when h <= 0.
 * Only byte-sized accesses are made, so the pointers need no particular alignment. */
void
swCopy1ByteWideRectangle_UnAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      const uint8_t r0 = src[0 * spitch];
      const uint8_t r1 = src[1 * spitch];
      const uint8_t r2 = src[2 * spitch];
      const uint8_t r3 = src[3 * spitch];
      const uint8_t r4 = src[4 * spitch];
      const uint8_t r5 = src[5 * spitch];
      const uint8_t r6 = src[6 * spitch];
      const uint8_t r7 = src[7 * spitch];
      src += 8 * spitch;

      dst[0 * dpitch] = r0;
      dst[1 * dpitch] = r1;
      dst[2 * dpitch] = r2;
      dst[3 * dpitch] = r3;
      dst[4 * dpitch] = r4;
      dst[5 * dpitch] = r5;
      dst[6 * dpitch] = r6;
      dst[7 * dpitch] = r7;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      const uint8_t r0 = src[0 * spitch];
      const uint8_t r1 = src[1 * spitch];
      const uint8_t r2 = src[2 * spitch];
      const uint8_t r3 = src[3 * spitch];
      src += 4 * spitch;

      dst[0 * dpitch] = r0;
      dst[1 * dpitch] = r1;
      dst[2 * dpitch] = r2;
      dst[3 * dpitch] = r3;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      const uint8_t r0 = src[0 * spitch];
      const uint8_t r1 = src[1 * spitch];
      src += 2 * spitch;

      dst[0 * dpitch] = r0;
      dst[1 * dpitch] = r1;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      const uint8_t r0 = *src;
      *dst = r0;
   }
}


/* Copy an h-row rectangle that is 2 bytes wide.  Row k is loaded as one 16-bit value from
 * src + k*spitch and stored as one 16-bit value to dst + k*dpitch (both pitches are byte
 * offsets).  The rows are processed in hand-unrolled groups of 8, followed by at most one group
 * of 4, one of 2 and one single row; inside a group every source value is loaded before the
 * first destination value is stored.  Nothing is copied when h <= 0.
 * Every row address is accessed through a uint16_t pointer, so the caller must keep each row
 * address half-word-aligned (even base pointers and even pitches). */
void
swCopy2ByteWideRectangle_HalfWordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      const uint16_t r0 = *(const uint16_t *)(src + 0 * spitch);
      const uint16_t r1 = *(const uint16_t *)(src + 1 * spitch);
      const uint16_t r2 = *(const uint16_t *)(src + 2 * spitch);
      const uint16_t r3 = *(const uint16_t *)(src + 3 * spitch);
      const uint16_t r4 = *(const uint16_t *)(src + 4 * spitch);
      const uint16_t r5 = *(const uint16_t *)(src + 5 * spitch);
      const uint16_t r6 = *(const uint16_t *)(src + 6 * spitch);
      const uint16_t r7 = *(const uint16_t *)(src + 7 * spitch);
      src += 8 * spitch;

      *(uint16_t *)(dst + 0 * dpitch) = r0;
      *(uint16_t *)(dst + 1 * dpitch) = r1;
      *(uint16_t *)(dst + 2 * dpitch) = r2;
      *(uint16_t *)(dst + 3 * dpitch) = r3;
      *(uint16_t *)(dst + 4 * dpitch) = r4;
      *(uint16_t *)(dst + 5 * dpitch) = r5;
      *(uint16_t *)(dst + 6 * dpitch) = r6;
      *(uint16_t *)(dst + 7 * dpitch) = r7;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      const uint16_t r0 = *(const uint16_t *)(src + 0 * spitch);
      const uint16_t r1 = *(const uint16_t *)(src + 1 * spitch);
      const uint16_t r2 = *(const uint16_t *)(src + 2 * spitch);
      const uint16_t r3 = *(const uint16_t *)(src + 3 * spitch);
      src += 4 * spitch;

      *(uint16_t *)(dst + 0 * dpitch) = r0;
      *(uint16_t *)(dst + 1 * dpitch) = r1;
      *(uint16_t *)(dst + 2 * dpitch) = r2;
      *(uint16_t *)(dst + 3 * dpitch) = r3;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      const uint16_t r0 = *(const uint16_t *)(src + 0 * spitch);
      const uint16_t r1 = *(const uint16_t *)(src + 1 * spitch);
      src += 2 * spitch;

      *(uint16_t *)(dst + 0 * dpitch) = r0;
      *(uint16_t *)(dst + 1 * dpitch) = r1;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      const uint16_t r0 = *(const uint16_t *)src;
      *(uint16_t *)dst = r0;
   }
}


/* Copy an h-row rectangle that is 4 bytes wide.  Row k is loaded as one 32-bit value from
 * src + k*spitch and stored as one 32-bit value to dst + k*dpitch (both pitches are byte
 * offsets).  The rows are processed in hand-unrolled groups of 8, followed by at most one group
 * of 4, one of 2 and one single row; inside a group every source value is loaded before the
 * first destination value is stored.  Nothing is copied when h <= 0.
 * Every row address is accessed through a uint32_t pointer, so the caller must keep each row
 * address word-aligned (base pointers and pitches divisible by 4). */
void
swCopy4ByteWideRectangle_WordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      const uint32_t r0 = *(const uint32_t *)(src + 0 * spitch);
      const uint32_t r1 = *(const uint32_t *)(src + 1 * spitch);
      const uint32_t r2 = *(const uint32_t *)(src + 2 * spitch);
      const uint32_t r3 = *(const uint32_t *)(src + 3 * spitch);
      const uint32_t r4 = *(const uint32_t *)(src + 4 * spitch);
      const uint32_t r5 = *(const uint32_t *)(src + 5 * spitch);
      const uint32_t r6 = *(const uint32_t *)(src + 6 * spitch);
      const uint32_t r7 = *(const uint32_t *)(src + 7 * spitch);
      src += 8 * spitch;

      *(uint32_t *)(dst + 0 * dpitch) = r0;
      *(uint32_t *)(dst + 1 * dpitch) = r1;
      *(uint32_t *)(dst + 2 * dpitch) = r2;
      *(uint32_t *)(dst + 3 * dpitch) = r3;
      *(uint32_t *)(dst + 4 * dpitch) = r4;
      *(uint32_t *)(dst + 5 * dpitch) = r5;
      *(uint32_t *)(dst + 6 * dpitch) = r6;
      *(uint32_t *)(dst + 7 * dpitch) = r7;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      const uint32_t r0 = *(const uint32_t *)(src + 0 * spitch);
      const uint32_t r1 = *(const uint32_t *)(src + 1 * spitch);
      const uint32_t r2 = *(const uint32_t *)(src + 2 * spitch);
      const uint32_t r3 = *(const uint32_t *)(src + 3 * spitch);
      src += 4 * spitch;

      *(uint32_t *)(dst + 0 * dpitch) = r0;
      *(uint32_t *)(dst + 1 * dpitch) = r1;
      *(uint32_t *)(dst + 2 * dpitch) = r2;
      *(uint32_t *)(dst + 3 * dpitch) = r3;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      const uint32_t r0 = *(const uint32_t *)(src + 0 * spitch);
      const uint32_t r1 = *(const uint32_t *)(src + 1 * spitch);
      src += 2 * spitch;

      *(uint32_t *)(dst + 0 * dpitch) = r0;
      *(uint32_t *)(dst + 1 * dpitch) = r1;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      const uint32_t r0 = *(const uint32_t *)src;
      *(uint32_t *)dst = r0;
   }
}


/* Copy an h-row rectangle that is 8 bytes wide.  Row k is loaded as one 64-bit value from
 * src + k*spitch and stored as one 64-bit value to dst + k*dpitch (both pitches are byte
 * offsets).  The rows are processed in hand-unrolled groups of 8, followed by at most one group
 * of 4, one of 2 and one single row; inside a group every source value is loaded before the
 * first destination value is stored.  Nothing is copied when h <= 0.
 * Every row address is accessed through a uint64_t pointer, so the caller must keep each row
 * address double-word-aligned (base pointers and pitches divisible by 8). */
void
swCopy8ByteWideRectangle_DoubleWordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      const uint64_t r0 = *(const uint64_t *)(src + 0 * spitch);
      const uint64_t r1 = *(const uint64_t *)(src + 1 * spitch);
      const uint64_t r2 = *(const uint64_t *)(src + 2 * spitch);
      const uint64_t r3 = *(const uint64_t *)(src + 3 * spitch);
      const uint64_t r4 = *(const uint64_t *)(src + 4 * spitch);
      const uint64_t r5 = *(const uint64_t *)(src + 5 * spitch);
      const uint64_t r6 = *(const uint64_t *)(src + 6 * spitch);
      const uint64_t r7 = *(const uint64_t *)(src + 7 * spitch);
      src += 8 * spitch;

      *(uint64_t *)(dst + 0 * dpitch) = r0;
      *(uint64_t *)(dst + 1 * dpitch) = r1;
      *(uint64_t *)(dst + 2 * dpitch) = r2;
      *(uint64_t *)(dst + 3 * dpitch) = r3;
      *(uint64_t *)(dst + 4 * dpitch) = r4;
      *(uint64_t *)(dst + 5 * dpitch) = r5;
      *(uint64_t *)(dst + 6 * dpitch) = r6;
      *(uint64_t *)(dst + 7 * dpitch) = r7;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      const uint64_t r0 = *(const uint64_t *)(src + 0 * spitch);
      const uint64_t r1 = *(const uint64_t *)(src + 1 * spitch);
      const uint64_t r2 = *(const uint64_t *)(src + 2 * spitch);
      const uint64_t r3 = *(const uint64_t *)(src + 3 * spitch);
      src += 4 * spitch;

      *(uint64_t *)(dst + 0 * dpitch) = r0;
      *(uint64_t *)(dst + 1 * dpitch) = r1;
      *(uint64_t *)(dst + 2 * dpitch) = r2;
      *(uint64_t *)(dst + 3 * dpitch) = r3;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      const uint64_t r0 = *(const uint64_t *)(src + 0 * spitch);
      const uint64_t r1 = *(const uint64_t *)(src + 1 * spitch);
      src += 2 * spitch;

      *(uint64_t *)(dst + 0 * dpitch) = r0;
      *(uint64_t *)(dst + 1 * dpitch) = r1;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      const uint64_t r0 = *(const uint64_t *)src;
      *(uint64_t *)dst = r0;
   }
}


/* Fill an h-row rectangle that is 1 byte wide with the byte value src.  Row k is stored to
 * dst + k*dpitch (dpitch is a byte offset).  The rows are written in hand-unrolled groups of 8,
 * followed by at most one group of 4, one of 2 and one single row.  Nothing is written when
 * h <= 0.
 * Only byte-sized stores are made, so dst needs no particular alignment. */
void
swFill1ByteWideRectangle_UnAligned(uint8_t *dst, uint8_t src, int h, int dpitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      dst[0 * dpitch] = src;
      dst[1 * dpitch] = src;
      dst[2 * dpitch] = src;
      dst[3 * dpitch] = src;
      dst[4 * dpitch] = src;
      dst[5 * dpitch] = src;
      dst[6 * dpitch] = src;
      dst[7 * dpitch] = src;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      dst[0 * dpitch] = src;
      dst[1 * dpitch] = src;
      dst[2 * dpitch] = src;
      dst[3 * dpitch] = src;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      dst[0 * dpitch] = src;
      dst[1 * dpitch] = src;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      *dst = src;
   }
}


/* Fill an h-row rectangle that is 2 bytes wide with the 16-bit value src.  Row k is stored as
 * one 16-bit value to dst + k*dpitch (dpitch is a byte offset).  The rows are written in
 * hand-unrolled groups of 8, followed by at most one group of 4, one of 2 and one single row.
 * Nothing is written when h <= 0.
 * Every row address is written through a uint16_t pointer, so the caller must keep each row
 * address half-word-aligned (even dst and even dpitch). */
void
swFill2ByteWideRectangle_HalfWordAligned(uint8_t *dst, uint16_t src, int h, int dpitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      *(uint16_t *)(dst + 0 * dpitch) = src;
      *(uint16_t *)(dst + 1 * dpitch) = src;
      *(uint16_t *)(dst + 2 * dpitch) = src;
      *(uint16_t *)(dst + 3 * dpitch) = src;
      *(uint16_t *)(dst + 4 * dpitch) = src;
      *(uint16_t *)(dst + 5 * dpitch) = src;
      *(uint16_t *)(dst + 6 * dpitch) = src;
      *(uint16_t *)(dst + 7 * dpitch) = src;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      *(uint16_t *)(dst + 0 * dpitch) = src;
      *(uint16_t *)(dst + 1 * dpitch) = src;
      *(uint16_t *)(dst + 2 * dpitch) = src;
      *(uint16_t *)(dst + 3 * dpitch) = src;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      *(uint16_t *)(dst + 0 * dpitch) = src;
      *(uint16_t *)(dst + 1 * dpitch) = src;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      *(uint16_t *)dst = src;
   }
}


/* Fill an h-row rectangle that is 4 bytes wide with the 32-bit value src.  Row k is stored as
 * one 32-bit value to dst + k*dpitch (dpitch is a byte offset).  The rows are written in
 * hand-unrolled groups of 8, followed by at most one group of 4, one of 2 and one single row.
 * Nothing is written when h <= 0.
 * Every row address is written through a uint32_t pointer, so the caller must keep each row
 * address word-aligned (dst and dpitch divisible by 4). */
void
swFill4ByteWideRectangle_WordAligned(uint8_t *dst, uint32_t src, int h, int dpitch) {
   /* Main loop: eight rows per iteration. */
   for (; h >= 8; h -= 8) {
      *(uint32_t *)(dst + 0 * dpitch) = src;
      *(uint32_t *)(dst + 1 * dpitch) = src;
      *(uint32_t *)(dst + 2 * dpitch) = src;
      *(uint32_t *)(dst + 3 * dpitch) = src;
      *(uint32_t *)(dst + 4 * dpitch) = src;
      *(uint32_t *)(dst + 5 * dpitch) = src;
      *(uint32_t *)(dst + 6 * dpitch) = src;
      *(uint32_t *)(dst + 7 * dpitch) = src;
      dst += 8 * dpitch;
   }

   /* Fewer than eight rows remain, so each of the tail steps below runs at most once. */
   if (h >= 4) {
      *(uint32_t *)(dst + 0 * dpitch) = src;
      *(uint32_t *)(dst + 1 * dpitch) = src;
      *(uint32_t *)(dst + 2 * dpitch) = src;
      *(uint32_t *)(dst + 3 * dpitch) = src;
      dst += 4 * dpitch;
      h -= 4;
   }

   if (h >= 2) {
      *(uint32_t *)(dst + 0 * dpitch) = src;
      *(uint32_t *)(dst + 1 * dpitch) = src;
      dst += 2 * dpitch;
      h -= 2;
   }

   /* Last odd row, if any. */
   if (h > 0) {
      *(uint32_t *)dst = src;
   }
}



