Keyboard firmwares for Atmel AVR and Cortex-M

arm_correlate_opt_q15.c 13KB

/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
 *
 * $Date:        17. January 2013
 * $Revision:    V1.4.1
 *
 * Project:      CMSIS DSP Library
 * Title:        arm_correlate_opt_q15.c
 *
 * Description:  Correlation of Q15 sequences.
 *
 * Target Processor: Cortex-M4/Cortex-M3
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Corr
 * @{
 */
/**
 * @brief Correlation of Q15 sequences.
 * @param[in]  *pSrcA     points to the first input sequence.
 * @param[in]  srcALen    length of the first input sequence.
 * @param[in]  *pSrcB     points to the second input sequence.
 * @param[in]  srcBLen    length of the second input sequence.
 * @param[out] *pDst      points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
 * @param[in]  *pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
 * @return none.
 *
 * \par Restrictions
 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
 * In this case the input, output and scratch buffers should be 32-bit aligned.
 *
 * @details
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The function is implemented using a 64-bit internal accumulator.
 * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
 * This approach provides 33 guard bits and there is no risk of overflow.
 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
 *
 * \par
 * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
 *
 * \par
 * An illustrative usage sketch (editorial addition) showing how to size these buffers follows the function body below.
 */
void arm_correlate_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch)
{
  q15_t *pIn1;                      /* inputA pointer */
  q15_t *pIn2;                      /* inputB pointer */
  q63_t acc0, acc1, acc2, acc3;     /* Accumulators */
  q15_t *py;                        /* Intermediate inputB pointer */
  q31_t x1, x2, x3;                 /* Temporary variables for holding input1 and input2 values */
  uint32_t j, blkCnt, outBlockSize; /* Loop counters and output size */
  int32_t inc = 1;                  /* Output pointer increment */
  uint32_t tapCnt;
  q31_t y1, y2;
  q15_t *pScr;                      /* Intermediate pointer */
  q15_t *pOut = pDst;               /* Output pointer */

#ifdef UNALIGNED_SUPPORT_DISABLE
  q15_t a, b;
#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA,
   * so srcBLen is always treated as shorter than or equal to srcALen. */
  /* But CORR(x, y) is the reverse of CORR(y, x),
   * so when srcBLen > srcALen, the output pointer is made to point to the end
   * of the output buffer and the destination pointer modifier, inc, is set to -1. */
  /* If srcALen > srcBLen, srcB would have to be zero padded to make the two
   * inputs the same length.  To improve performance, zeroes are included in
   * the output instead of zero padding either of the inputs: */
  /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at
   * the start of the output buffer. */
  /* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at
   * the end of the output buffer. */
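  /* Worked example (illustrative note, not in the original source): for
   * srcALen = 8 and srcBLen = 3, outBlockSize = 2*8 - 1 = 15 while the
   * non-zero part of the correlation has 8 + 3 - 1 = 10 samples, so
   * j = 15 - 10 = 5 (= srcALen - srcBLen) and pOut is advanced past the
   * 5 leading zero positions before the first computed sample is stored. */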
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcA);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcB);

    /* Number of output samples is calculated */
    outBlockSize = (2u * srcALen) - 1u;

    /* When srcALen > srcBLen, zero padding would be done to srcB to make the
     * lengths equal.  Instead, (outBlockSize - (srcALen + srcBLen - 1))
     * output samples are made zero. */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    /* Update the pointer position to the first non-zero output sample */
    pOut += j;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcB);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcA);

    /* srcBLen is always considered as shorter or equal to srcALen,
     * so swap the two lengths. */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)) */
    /* Hence set the destination pointer to point to the last output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2u);

    /* Destination address modifier is set to -1 */
    inc = -1;
  }
  pScr = pScratch;
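  /* Layout of the scratch buffer built below (descriptive note, not in the
   * original source):
   *   [ (srcBLen - 1) zeros | srcALen input samples | (srcBLen - 1) zeros ]
   * i.e. srcALen + 2*(srcBLen - 1) samples in total, which matches the
   * documented scratch size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. */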
  /* Fill (srcBLen - 1u) zeros in scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr += (srcBLen - 1u);

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples in scratch buffer */
  arm_copy_q15(pIn1, pScr, srcALen);

  /* Update pointers */
  //pIn1 += srcALen;
  pScr += srcALen;

#else

  /* Apply loop unrolling and do 4 copies simultaneously. */
  j = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   * A second loop below copies the remaining 1 to 3 samples. */
  while(j > 0u)
  {
    /* Copy input samples into the scratch buffer */
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
   * No loop unrolling is used. */
  j = srcALen % 0x4u;

  while(j > 0u)
  {
    /* Copy the remaining input samples */
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update pointer */
  pScr += (srcBLen - 1u);

#else

  /* Apply loop unrolling and write 4 zeros simultaneously. */
  j = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling writes 4 zeros at a time.
   * A second loop below writes the remaining 1 to 3 zeros. */
  while(j > 0u)
  {
    /* Fill zeros at the end of the scratch buffer */
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, fill the remaining zeros here.
   * No loop unrolling is used. */
  j = (srcBLen - 1u) % 0x4u;

  while(j > 0u)
  {
    /* Fill the remaining zeros */
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

  /* Save the start of the inputB sequence so it can be restored for each output block */
  py = pIn2;

  /* Actual correlation process starts here */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;
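  /* Descriptive note (not in the original source): each pass of the outer loop
   * below computes four consecutive output samples in acc0..acc3.  pScratch is
   * advanced by 4 and pIn2 is reset to the start of the shorter sequence at the
   * end of each pass, so successive passes evaluate four new lags. */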
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr = pScratch;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read two samples (one 32-bit word) from the scratch buffer */
    x1 = *__SIMD32(pScr)++;

    /* Read the next two samples from the scratch buffer */
    x2 = *__SIMD32(pScr)++;

    tapCnt = (srcBLen) >> 2u;
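    /* Note on the intrinsics used below (descriptive note, not in the original
     * source): __SMLALD multiplies the two signed 16-bit halves of its first
     * operand by the corresponding halves of its second operand and adds both
     * products to the 64-bit accumulator; __SMLALDX does the same with the
     * halfwords of the second operand exchanged; __PKHBT packs two 16-bit
     * values into one 32-bit word so sample pairs can be re-aligned between lags. */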
    while(tapCnt > 0u)
    {
#ifndef UNALIGNED_SUPPORT_DISABLE
      /* Read four samples from the smaller buffer */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

      acc0 = __SMLALD(x1, y1, acc0);
      acc2 = __SMLALD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLALDX(x3, y1, acc1);

      x1 = _SIMD32_OFFSET(pScr);

      acc0 = __SMLALD(x2, y2, acc0);
      acc2 = __SMLALD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);
      acc1 = __SMLALDX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);
#else
      /* Read four samples from the smaller buffer */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x1, y1, acc0);
      acc2 = __SMLALD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLALDX(x3, y1, acc1);

      a = *pScr;
      b = *(pScr + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x2, y2, acc0);
      acc2 = __SMLALD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);
      acc1 = __SMLALDX(x3, y2, acc1);

      a = *(pScr + 2);
      b = *(pScr + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

      pIn2 += 4u;
      pScr += 4u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    /* Adjust the scratch pointer back over the preloaded samples before
     * processing the remaining taps of the smaller length sequence */
    pScr -= 4u;

    /* Apply the same computation for the remaining samples of the smaller length sequence */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {
      /* Accumulate the results */
      acc0 += (*pScr++ * *pIn2);
      acc1 += (*pScr++ * *pIn2);
      acc2 += (*pScr++ * *pIn2);
      acc3 += (*pScr++ * *pIn2++);

      pScr -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the results from the accumulators in the destination buffer
     * (the shift by 15 truncates the 34.30 accumulator, __SSAT saturates to 1.15). */
    *pOut = (__SSAT(acc0 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc1 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc2 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc3 >> 15u, 16));
    pOut += inc;

    /* Reset inputB pointer to the start of the shorter sequence */
    pIn2 = py;

    /* Advance the scratch pointer to the next set of four lags */
    pScratch += 4u;
  }
  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate the correlation for the remaining output samples */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr = pScratch;

    /* Clear accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {
      acc0 += (*pScr++ * *pIn2++);
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* Apply the same computation for the remaining sample of the smaller length sequence */
    while(tapCnt > 0u)
    {
      /* Accumulate the result */
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the result from the accumulator in the destination buffer. */
    *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
    pOut += inc;

    /* Reset inputB pointer to the start of the shorter sequence */
    pIn2 = py;

    pScratch += 1u;
  }
}
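/* Usage sketch (editorial addition, not part of the original CMSIS source):
 * shows how the destination and scratch buffers documented above could be
 * sized for hypothetical lengths SRCA_LEN = 64 and SRCB_LEN = 16.  The macro
 * and function names are illustrative only, and the block is guarded with
 * #if 0 so it does not affect the library build. */
#if 0
#define SRCA_LEN 64u
#define SRCB_LEN 16u

static q15_t srcA[SRCA_LEN];                          /* first input sequence      */
static q15_t srcB[SRCB_LEN];                          /* second input sequence     */
static q15_t dst[2u * SRCA_LEN - 1u];                 /* 2 * max(64, 16) - 1 = 127 */
static q15_t scratch[SRCA_LEN + 2u * SRCB_LEN - 2u];  /* max + 2 * min - 2 = 94    */

static void correlate_example(void)
{
  /* Cross-correlate srcA with srcB; the result is written to dst. */
  arm_correlate_opt_q15(srcA, SRCA_LEN, srcB, SRCB_LEN, dst, scratch);
}
#endif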
/**
 * @} end of Corr group
 */