Keyboard firmwares for Atmel AVR and Cortex-M
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

arm_correlate_opt_q7.c 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_correlate_opt_q7.c
  9. *
  10. * Description: Correlation of Q7 sequences.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup Corr
  46. * @{
  47. */
  48. /**
  49. * @brief Correlation of Q7 sequences.
  50. * @param[in] *pSrcA points to the first input sequence.
  51. * @param[in] srcALen length of the first input sequence.
  52. * @param[in] *pSrcB points to the second input sequence.
  53. * @param[in] srcBLen length of the second input sequence.
  54. * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
  55. * @param[in] *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
  56. * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
  57. * @return none.
  58. *
  59. *
  60. * \par Restrictions
  61. * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  62. * In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
  63. *
  64. * @details
  65. * <b>Scaling and Overflow Behavior:</b>
  66. *
  67. * \par
  68. * The function is implemented using a 32-bit internal accumulator.
  69. * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
  70. * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  71. * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
  72. * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
  73. *
  74. *
  75. */
  76. void arm_correlate_opt_q7(
  77. q7_t * pSrcA, /* first input sequence */
  78. uint32_t srcALen, /* number of samples in pSrcA */
  79. q7_t * pSrcB, /* second input sequence */
  80. uint32_t srcBLen, /* number of samples in pSrcB */
  81. q7_t * pDst, /* output buffer */
  82. q15_t * pScratch1, /* scratch: zero-padded q15 copy of the longer (or equal-length) input */
  83. q15_t * pScratch2) /* scratch: q15 copy of the shorter input */
  84. { /* Correlates two q7 sequences: both inputs are first copied into q15 scratch buffers, then each output sample is a sum of products of scratch2 with a sliding window of scratch1. */
  85. q7_t *pOut = pDst; /* output pointer */
  86. q15_t *pScr1 = pScratch1; /* Working pointer into scratch1 */
  87. q15_t *pScr2 = pScratch2; /* Working pointer into scratch2 */
  88. q7_t *pIn1; /* Points to the longer (or equal-length) input */
  89. q7_t *pIn2; /* Points to the shorter input */
  90. q15_t *py; /* Start of scratch2, used to rewind pScr2 after each output */
  91. q31_t acc0, acc1, acc2, acc3; /* Accumulators for four adjacent output samples */
  92. uint32_t j, k = 0u, blkCnt; /* j: temporary (skip count / length swap), k: copy loop counter, blkCnt: output loop counter */
  93. int32_t inc = 1; /* output pointer increment (+1, or -1 when the inputs are swapped) */
  94. uint32_t outBlockSize; /* Number of output samples; only assigned and used when srcALen >= srcBLen */
  95. q15_t x4; /* Temporary holding one input sample converted to q15_t */
  96. uint32_t tapCnt; /* loop counter over the samples of the shorter sequence */
  97. q31_t x1, x2, x3, y1; /* 32-bit words read from, or packed from, the scratch buffers */
  98. /* The algorithm implementation is based on the lengths of the inputs. */
  99. /* srcB is always made to slide across srcA. */
  100. /* So srcBLen is always treated as shorter than or equal to srcALen (the two are swapped below if needed) */
  101. /* But CORR(x, y) is the reverse of CORR(y, x) */
  102. /* So, when srcBLen > srcALen, the output pointer is set to the last computed output sample, pDst + (srcALen + srcBLen - 2) */
  103. /* and the destination pointer modifier, inc, is set to -1 so results are stored back to front */
  104. /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
  105. /* But to improve the performance,
  106. * the corresponding output positions are skipped instead of zero padding either of the inputs.
  * Note that this function only skips them; it never stores zeros into pDst. */
  107. /* If srcALen > srcBLen,
  108. * the first (srcALen - srcBLen) positions of the output buffer are skipped */
  109. /* If srcALen < srcBLen,
  110. * the last (srcBLen - srcALen) positions of the output buffer are left untouched */
  111. if(srcALen >= srcBLen)
  112. {
  113. /* pIn1 takes the longer (or equal-length) sequence */
  114. pIn1 = (pSrcA);
  115. /* pIn2 takes the shorter sequence */
  116. pIn2 = (pSrcB);
  117. /* Number of output samples is calculated */
  118. outBlockSize = (2u * srcALen) - 1u;
  119. /* When srcALen > srcBLen, zero padding of srcB would be needed
  120. * to make their lengths equal.
  121. * Instead, (outBlockSize - (srcALen + srcBLen - 1))
  122. * output positions are skipped; this works out to (srcALen - srcBLen) */
  123. j = outBlockSize - (srcALen + (srcBLen - 1u));
  124. /* Advance the output pointer past the skipped positions (nothing is stored into them here) */
  125. pOut += j;
  126. }
  127. else
  128. {
  129. /* pIn1 takes the longer sequence, which is srcB in this branch */
  130. pIn1 = (pSrcB);
  131. /* pIn2 takes the shorter sequence, which is srcA in this branch */
  132. pIn2 = (pSrcA);
  133. /* Swap the lengths so that srcBLen is the shorter one from here on */
  134. j = srcBLen;
  135. srcBLen = srcALen;
  136. srcALen = j;
  137. /* CORR(x, y) = Reverse order(CORR(y, x)) */
  138. /* Hence set the destination pointer to point to the last output sample */
  139. pOut = pDst + ((srcALen + srcBLen) - 2u);
  140. /* Destination address modifier is set to -1 */
  141. inc = -1;
  142. }
  143. /* Copy the srcBLen samples of the shorter sequence into scratch2 */
  144. k = srcBLen >> 2u;
  145. /* First part of the processing with loop unrolling copies 4 data points at a time.
  146. ** A second loop below copies the remaining 1 to 3 samples. */
  147. while(k > 0u)
  148. {
  149. /* Convert four samples to q15_t and append them to scratch2 in their original (forward) order */
  150. x4 = (q15_t) * pIn2++;
  151. *pScr2++ = x4;
  152. x4 = (q15_t) * pIn2++;
  153. *pScr2++ = x4;
  154. x4 = (q15_t) * pIn2++;
  155. *pScr2++ = x4;
  156. x4 = (q15_t) * pIn2++;
  157. *pScr2++ = x4;
  158. /* Decrement the loop counter */
  159. k--;
  160. }
  161. /* If the count is not a multiple of 4, copy remaining samples here.
  162. ** No loop unrolling is used. */
  163. k = srcBLen % 0x4u;
  164. while(k > 0u)
  165. {
  166. /* Convert one remaining sample to q15_t and append it to scratch2 (forward order) */
  167. x4 = (q15_t) * pIn2++;
  168. *pScr2++ = x4;
  169. /* Decrement the loop counter */
  170. k--;
  171. }
  172. /* Fill (srcBLen - 1u) zeros at the start of scratch1 (leading pad). NOTE(review): srcBLen is unsigned, so if the shorter length is 0 this count wraps around to a huge value -- confirm callers never pass an empty sequence. */
  173. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  174. /* Move the scratch1 pointer past the leading zeros */
  175. pScr1 += (srcBLen - 1u);
  176. /* Copy the srcALen samples of the longer sequence into scratch1, after the leading zeros */
  177. k = srcALen >> 2u;
  178. /* First part of the processing with loop unrolling copies 4 data points at a time.
  179. ** A second loop below copies the remaining 1 to 3 samples. */
  180. while(k > 0u)
  181. {
  182. /* Convert four samples to q15_t and append them to scratch1 in their original (forward) order */
  183. x4 = (q15_t) * pIn1++;
  184. *pScr1++ = x4;
  185. x4 = (q15_t) * pIn1++;
  186. *pScr1++ = x4;
  187. x4 = (q15_t) * pIn1++;
  188. *pScr1++ = x4;
  189. x4 = (q15_t) * pIn1++;
  190. *pScr1++ = x4;
  191. /* Decrement the loop counter */
  192. k--;
  193. }
  194. /* If the count is not a multiple of 4, copy remaining samples here.
  195. ** No loop unrolling is used. */
  196. k = srcALen % 0x4u;
  197. while(k > 0u)
  198. {
  199. /* Convert one remaining sample to q15_t and append it to scratch1 (forward order) */
  200. x4 = (q15_t) * pIn1++;
  201. *pScr1++ = x4;
  202. /* Decrement the loop counter */
  203. k--;
  204. }
  205. #ifndef UNALIGNED_SUPPORT_DISABLE
  206. /* Fill (srcBLen - 1u) zeros at end of scratch buffer (trailing pad) */
  207. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  208. /* Update pointer */
  209. pScr1 += (srcBLen - 1u);
  210. #else
  211. /* Same trailing pad of (srcBLen - 1u) zeros, written with explicit stores instead of arm_fill_q15 */
  212. k = (srcBLen - 1u) >> 2u;
  213. /* First part of the processing with loop unrolling stores 4 zeros at a time.
  214. ** A second loop below stores the remaining 1 to 3 zeros. */
  215. while(k > 0u)
  216. {
  217. /* Store four zeros into scratch1 */
  218. *pScr1++ = 0;
  219. *pScr1++ = 0;
  220. *pScr1++ = 0;
  221. *pScr1++ = 0;
  222. /* Decrement the loop counter */
  223. k--;
  224. }
  225. /* If the count is not a multiple of 4, store the remaining zeros here.
  226. ** No loop unrolling is used. */
  227. k = (srcBLen - 1u) % 0x4u;
  228. while(k > 0u)
  229. {
  230. /* Store one remaining zero into scratch1 */
  231. *pScr1++ = 0;
  232. /* Decrement the loop counter */
  233. k--;
  234. }
  235. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  236. /* Remember the start of scratch2 so pScr2 can be rewound after every output */
  237. py = pScratch2;
  238. /* Rewind pScr2 to the start of scratch2 */
  239. pScr2 = pScratch2;
  240. /* Actual correlation process starts here: (srcALen + srcBLen - 1) outputs, computed four at a time */
  241. blkCnt = (srcALen + srcBLen - 1u) >> 2;
  242. while(blkCnt > 0)
  243. {
  244. /* Initialize temporary scratch pointer to the current window base in scratch1 */
  245. pScr1 = pScratch1;
  246. /* Clear Accumulators */
  247. acc0 = 0;
  248. acc1 = 0;
  249. acc2 = 0;
  250. acc3 = 0;
  251. /* Read two samples from scratch1 buffer */
  252. x1 = *__SIMD32(pScr1)++;
  253. /* Read next two samples from scratch1 buffer */
  254. x2 = *__SIMD32(pScr1)++;
  255. tapCnt = (srcBLen) >> 2u; /* each pass of the loop below advances pScr2 by four entries */
  256. while(tapCnt > 0u)
  257. {
  258. /* Read one 32-bit word from the smaller buffer (presumably two q15 samples, given the +2 offset of the next read -- confirm against _SIMD32_OFFSET in arm_math.h) */
  259. y1 = _SIMD32_OFFSET(pScr2);
  260. /* multiply and accumulate: acc0 uses x1, acc2 uses x2 */
  261. acc0 = __SMLAD(x1, y1, acc0);
  262. acc2 = __SMLAD(x2, y1, acc2);
  263. /* pack halves of x1 and x2 into x3 (operand order depends on endianness) */
  264. #ifndef ARM_MATH_BIG_ENDIAN
  265. x3 = __PKHBT(x2, x1, 0);
  266. #else
  267. x3 = __PKHBT(x1, x2, 0);
  268. #endif
  269. /* multiply and accumulate the packed word into acc1 */
  270. acc1 = __SMLADX(x3, y1, acc1);
  271. /* Read next two samples from scratch1 buffer */
  272. x1 = *__SIMD32(pScr1)++;
  273. /* pack halves of x2 and the newly read x1 into x3 */
  274. #ifndef ARM_MATH_BIG_ENDIAN
  275. x3 = __PKHBT(x1, x2, 0);
  276. #else
  277. x3 = __PKHBT(x2, x1, 0);
  278. #endif
  279. acc3 = __SMLADX(x3, y1, acc3);
  280. /* Read the next 32-bit word from the smaller buffer, two entries further on */
  281. y1 = _SIMD32_OFFSET(pScr2 + 2u);
  282. acc0 = __SMLAD(x2, y1, acc0);
  283. acc2 = __SMLAD(x1, y1, acc2);
  284. acc1 = __SMLADX(x3, y1, acc1);
  285. x2 = *__SIMD32(pScr1)++;
  286. #ifndef ARM_MATH_BIG_ENDIAN
  287. x3 = __PKHBT(x2, x1, 0);
  288. #else
  289. x3 = __PKHBT(x1, x2, 0);
  290. #endif
  291. acc3 = __SMLADX(x3, y1, acc3);
  292. pScr2 += 4u;
  293. /* Decrement the loop counter */
  294. tapCnt--;
  295. }
  296. /* Step the scratch1 pointer back by four entries before the scalar tail (presumably undoing the read-ahead of x1 and x2 -- confirm) */
  297. pScr1 -= 4u;
  298. /* Handle the remaining (srcBLen & 3) samples of the smaller sequence one at a time */
  299. tapCnt = (srcBLen) & 3u;
  300. while(tapCnt > 0u)
  301. {
  302. /* accumulate the results: one scratch2 sample times four consecutive scratch1 samples */
  303. acc0 += (*pScr1++ * *pScr2);
  304. acc1 += (*pScr1++ * *pScr2);
  305. acc2 += (*pScr1++ * *pScr2);
  306. acc3 += (*pScr1++ * *pScr2++);
  307. pScr1 -= 3u; /* net advance of one scratch1 entry per pass */
  308. /* Decrement the loop counter */
  309. tapCnt--;
  310. }
  311. blkCnt--;
  312. /* Shift each accumulator right by 7, saturate to 8 bits and store it in the destination buffer. */
  313. *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
  314. pOut += inc;
  315. *pOut = (q7_t) (__SSAT(acc1 >> 7u, 8));
  316. pOut += inc;
  317. *pOut = (q7_t) (__SSAT(acc2 >> 7u, 8));
  318. pOut += inc;
  319. *pOut = (q7_t) (__SSAT(acc3 >> 7u, 8));
  320. pOut += inc;
  321. /* Rewind the scratch2 pointer to its start for the next group of outputs */
  322. pScr2 = py;
  323. pScratch1 += 4u; /* move the scratch1 window base forward by four entries */
  324. }
  325. blkCnt = (srcALen + srcBLen - 1u) & 0x3;
  326. /* Calculate the remaining 0 to 3 output samples, one per pass */
  327. while(blkCnt > 0)
  328. {
  329. /* Initialize temporary scratch pointer to the current window base in scratch1 */
  330. pScr1 = pScratch1;
  331. /* Clear Accumulator */
  332. acc0 = 0;
  333. tapCnt = (srcBLen) >> 1u; /* two products per pass of the loop below */
  334. while(tapCnt > 0u)
  335. {
  336. acc0 += (*pScr1++ * *pScr2++);
  337. acc0 += (*pScr1++ * *pScr2++);
  338. /* Decrement the loop counter */
  339. tapCnt--;
  340. }
  341. tapCnt = (srcBLen) & 1u;
  342. /* Handle the one leftover sample when srcBLen is odd */
  343. while(tapCnt > 0u)
  344. {
  345. /* accumulate the results */
  346. acc0 += (*pScr1++ * *pScr2++);
  347. /* Decrement the loop counter */
  348. tapCnt--;
  349. }
  350. blkCnt--;
  351. /* Shift the accumulator right by 7, saturate to 8 bits and store it in the destination buffer. */
  352. *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
  353. pOut += inc;
  354. /* Rewind the scratch2 pointer to its start for the next output */
  355. pScr2 = py;
  356. pScratch1 += 1u; /* move the scratch1 window base forward by one entry */
  357. }
  358. }
  359. /**
  360. * @} end of Corr group
  361. */