Keyboard firmwares for Atmel AVR and Cortex-M
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

arm_correlate_q7.c 24KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_correlate_q7.c
  9. *
  10. * Description: Correlation of Q7 sequences.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup Corr
  46. * @{
  47. */
  48. /**
  49. * @brief Correlation of Q7 sequences.
  50. * @param[in] *pSrcA points to the first input sequence.
  51. * @param[in] srcALen length of the first input sequence.
  52. * @param[in] *pSrcB points to the second input sequence.
  53. * @param[in] srcBLen length of the second input sequence.
  54. * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
  55. * @return none.
  56. *
  57. * @details
  58. * <b>Scaling and Overflow Behavior:</b>
  59. *
  60. * \par
  61. * The function is implemented using a 32-bit internal accumulator.
  62. * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
  63. * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  64. * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
  65. * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
  66. *
  67. * \par
  68. * Refer the function <code>arm_correlate_opt_q7()</code> for a faster implementation of this function.
  69. *
  70. */
  71. void arm_correlate_q7(
  72. q7_t * pSrcA,
  73. uint32_t srcALen,
  74. q7_t * pSrcB,
  75. uint32_t srcBLen,
  76. q7_t * pDst)
  77. {
  78. #ifndef ARM_MATH_CM0_FAMILY
  79. /* Run the below code for Cortex-M4 and Cortex-M3 */
  80. q7_t *pIn1; /* inputA pointer */
  81. q7_t *pIn2; /* inputB pointer */
  82. q7_t *pOut = pDst; /* output pointer */
  83. q7_t *px; /* Intermediate inputA pointer */
  84. q7_t *py; /* Intermediate inputB pointer */
  85. q7_t *pSrc1; /* Intermediate pointers */
  86. q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
  87. q31_t input1, input2; /* temporary variables */
  88. q15_t in1, in2; /* temporary variables */
  89. q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */
  90. uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
  91. int32_t inc = 1;
  92. /* The algorithm implementation is based on the lengths of the inputs. */
  93. /* srcB is always made to slide across srcA. */
  94. /* So srcBLen is always considered as shorter or equal to srcALen */
  95. /* But CORR(x, y) is reverse of CORR(y, x) */
  96. /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  97. /* and the destination pointer modifier, inc is set to -1 */
  98. /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  99. /* But to improve the performance,
  100. * we include zeroes in the output instead of zero padding either of the the inputs*/
  101. /* If srcALen > srcBLen,
  102. * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
  103. /* If srcALen < srcBLen,
  104. * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
  105. if(srcALen >= srcBLen)
  106. {
  107. /* Initialization of inputA pointer */
  108. pIn1 = (pSrcA);
  109. /* Initialization of inputB pointer */
  110. pIn2 = (pSrcB);
  111. /* Number of output samples is calculated */
  112. outBlockSize = (2u * srcALen) - 1u;
  113. /* When srcALen > srcBLen, zero padding is done to srcB
  114. * to make their lengths equal.
  115. * Instead, (outBlockSize - (srcALen + srcBLen - 1))
  116. * number of output samples are made zero */
  117. j = outBlockSize - (srcALen + (srcBLen - 1u));
  118. /* Updating the pointer position to non zero value */
  119. pOut += j;
  120. }
  121. else
  122. {
  123. /* Initialization of inputA pointer */
  124. pIn1 = (pSrcB);
  125. /* Initialization of inputB pointer */
  126. pIn2 = (pSrcA);
  127. /* srcBLen is always considered as shorter or equal to srcALen */
  128. j = srcBLen;
  129. srcBLen = srcALen;
  130. srcALen = j;
  131. /* CORR(x, y) = Reverse order(CORR(y, x)) */
  132. /* Hence set the destination pointer to point to the last output sample */
  133. pOut = pDst + ((srcALen + srcBLen) - 2u);
  134. /* Destination address modifier is set to -1 */
  135. inc = -1;
  136. }
  137. /* The function is internally
  138. * divided into three parts according to the number of multiplications that has to be
  139. * taken place between inputA samples and inputB samples. In the first part of the
  140. * algorithm, the multiplications increase by one for every iteration.
  141. * In the second part of the algorithm, srcBLen number of multiplications are done.
  142. * In the third part of the algorithm, the multiplications decrease by one
  143. * for every iteration.*/
  144. /* The algorithm is implemented in three stages.
  145. * The loop counters of each stage is initiated here. */
  146. blockSize1 = srcBLen - 1u;
  147. blockSize2 = srcALen - (srcBLen - 1u);
  148. blockSize3 = blockSize1;
  149. /* --------------------------
  150. * Initializations of stage1
  151. * -------------------------*/
  152. /* sum = x[0] * y[srcBlen - 1]
  153. * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
  154. * ....
  155. * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
  156. */
  157. /* In this stage the MAC operations are increased by 1 for every iteration.
  158. The count variable holds the number of MAC operations performed */
  159. count = 1u;
  160. /* Working pointer of inputA */
  161. px = pIn1;
  162. /* Working pointer of inputB */
  163. pSrc1 = pIn2 + (srcBLen - 1u);
  164. py = pSrc1;
  165. /* ------------------------
  166. * Stage1 process
  167. * ----------------------*/
  168. /* The first stage starts here */
  169. while(blockSize1 > 0u)
  170. {
  171. /* Accumulator is made zero for every iteration */
  172. sum = 0;
  173. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  174. k = count >> 2;
  175. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  176. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  177. while(k > 0u)
  178. {
  179. /* x[0] , x[1] */
  180. in1 = (q15_t) * px++;
  181. in2 = (q15_t) * px++;
  182. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  183. /* y[srcBLen - 4] , y[srcBLen - 3] */
  184. in1 = (q15_t) * py++;
  185. in2 = (q15_t) * py++;
  186. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  187. /* x[0] * y[srcBLen - 4] */
  188. /* x[1] * y[srcBLen - 3] */
  189. sum = __SMLAD(input1, input2, sum);
  190. /* x[2] , x[3] */
  191. in1 = (q15_t) * px++;
  192. in2 = (q15_t) * px++;
  193. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  194. /* y[srcBLen - 2] , y[srcBLen - 1] */
  195. in1 = (q15_t) * py++;
  196. in2 = (q15_t) * py++;
  197. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  198. /* x[2] * y[srcBLen - 2] */
  199. /* x[3] * y[srcBLen - 1] */
  200. sum = __SMLAD(input1, input2, sum);
  201. /* Decrement the loop counter */
  202. k--;
  203. }
  204. /* If the count is not a multiple of 4, compute any remaining MACs here.
  205. ** No loop unrolling is used. */
  206. k = count % 0x4u;
  207. while(k > 0u)
  208. {
  209. /* Perform the multiply-accumulates */
  210. /* x[0] * y[srcBLen - 1] */
  211. sum += (q31_t) ((q15_t) * px++ * *py++);
  212. /* Decrement the loop counter */
  213. k--;
  214. }
  215. /* Store the result in the accumulator in the destination buffer. */
  216. *pOut = (q7_t) (__SSAT(sum >> 7, 8));
  217. /* Destination pointer is updated according to the address modifier, inc */
  218. pOut += inc;
  219. /* Update the inputA and inputB pointers for next MAC calculation */
  220. py = pSrc1 - count;
  221. px = pIn1;
  222. /* Increment the MAC count */
  223. count++;
  224. /* Decrement the loop counter */
  225. blockSize1--;
  226. }
  227. /* --------------------------
  228. * Initializations of stage2
  229. * ------------------------*/
  230. /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
  231. * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
  232. * ....
  233. * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  234. */
  235. /* Working pointer of inputA */
  236. px = pIn1;
  237. /* Working pointer of inputB */
  238. py = pIn2;
  239. /* count is index by which the pointer pIn1 to be incremented */
  240. count = 0u;
  241. /* -------------------
  242. * Stage2 process
  243. * ------------------*/
  244. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  245. * So, to loop unroll over blockSize2,
  246. * srcBLen should be greater than or equal to 4 */
  247. if(srcBLen >= 4u)
  248. {
  249. /* Loop unroll over blockSize2, by 4 */
  250. blkCnt = blockSize2 >> 2u;
  251. while(blkCnt > 0u)
  252. {
  253. /* Set all accumulators to zero */
  254. acc0 = 0;
  255. acc1 = 0;
  256. acc2 = 0;
  257. acc3 = 0;
  258. /* read x[0], x[1], x[2] samples */
  259. x0 = *px++;
  260. x1 = *px++;
  261. x2 = *px++;
  262. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  263. k = srcBLen >> 2u;
  264. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  265. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  266. do
  267. {
  268. /* Read y[0] sample */
  269. c0 = *py++;
  270. /* Read y[1] sample */
  271. c1 = *py++;
  272. /* Read x[3] sample */
  273. x3 = *px++;
  274. /* x[0] and x[1] are packed */
  275. in1 = (q15_t) x0;
  276. in2 = (q15_t) x1;
  277. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  278. /* y[0] and y[1] are packed */
  279. in1 = (q15_t) c0;
  280. in2 = (q15_t) c1;
  281. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  282. /* acc0 += x[0] * y[0] + x[1] * y[1] */
  283. acc0 = __SMLAD(input1, input2, acc0);
  284. /* x[1] and x[2] are packed */
  285. in1 = (q15_t) x1;
  286. in2 = (q15_t) x2;
  287. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  288. /* acc1 += x[1] * y[0] + x[2] * y[1] */
  289. acc1 = __SMLAD(input1, input2, acc1);
  290. /* x[2] and x[3] are packed */
  291. in1 = (q15_t) x2;
  292. in2 = (q15_t) x3;
  293. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  294. /* acc2 += x[2] * y[0] + x[3] * y[1] */
  295. acc2 = __SMLAD(input1, input2, acc2);
  296. /* Read x[4] sample */
  297. x0 = *(px++);
  298. /* x[3] and x[4] are packed */
  299. in1 = (q15_t) x3;
  300. in2 = (q15_t) x0;
  301. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  302. /* acc3 += x[3] * y[0] + x[4] * y[1] */
  303. acc3 = __SMLAD(input1, input2, acc3);
  304. /* Read y[2] sample */
  305. c0 = *py++;
  306. /* Read y[3] sample */
  307. c1 = *py++;
  308. /* Read x[5] sample */
  309. x1 = *px++;
  310. /* x[2] and x[3] are packed */
  311. in1 = (q15_t) x2;
  312. in2 = (q15_t) x3;
  313. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  314. /* y[2] and y[3] are packed */
  315. in1 = (q15_t) c0;
  316. in2 = (q15_t) c1;
  317. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  318. /* acc0 += x[2] * y[2] + x[3] * y[3] */
  319. acc0 = __SMLAD(input1, input2, acc0);
  320. /* x[3] and x[4] are packed */
  321. in1 = (q15_t) x3;
  322. in2 = (q15_t) x0;
  323. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  324. /* acc1 += x[3] * y[2] + x[4] * y[3] */
  325. acc1 = __SMLAD(input1, input2, acc1);
  326. /* x[4] and x[5] are packed */
  327. in1 = (q15_t) x0;
  328. in2 = (q15_t) x1;
  329. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  330. /* acc2 += x[4] * y[2] + x[5] * y[3] */
  331. acc2 = __SMLAD(input1, input2, acc2);
  332. /* Read x[6] sample */
  333. x2 = *px++;
  334. /* x[5] and x[6] are packed */
  335. in1 = (q15_t) x1;
  336. in2 = (q15_t) x2;
  337. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  338. /* acc3 += x[5] * y[2] + x[6] * y[3] */
  339. acc3 = __SMLAD(input1, input2, acc3);
  340. } while(--k);
  341. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  342. ** No loop unrolling is used. */
  343. k = srcBLen % 0x4u;
  344. while(k > 0u)
  345. {
  346. /* Read y[4] sample */
  347. c0 = *py++;
  348. /* Read x[7] sample */
  349. x3 = *px++;
  350. /* Perform the multiply-accumulates */
  351. /* acc0 += x[4] * y[4] */
  352. acc0 += ((q15_t) x0 * c0);
  353. /* acc1 += x[5] * y[4] */
  354. acc1 += ((q15_t) x1 * c0);
  355. /* acc2 += x[6] * y[4] */
  356. acc2 += ((q15_t) x2 * c0);
  357. /* acc3 += x[7] * y[4] */
  358. acc3 += ((q15_t) x3 * c0);
  359. /* Reuse the present samples for the next MAC */
  360. x0 = x1;
  361. x1 = x2;
  362. x2 = x3;
  363. /* Decrement the loop counter */
  364. k--;
  365. }
  366. /* Store the result in the accumulator in the destination buffer. */
  367. *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
  368. /* Destination pointer is updated according to the address modifier, inc */
  369. pOut += inc;
  370. *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
  371. pOut += inc;
  372. *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
  373. pOut += inc;
  374. *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
  375. pOut += inc;
  376. count += 4u;
  377. /* Update the inputA and inputB pointers for next MAC calculation */
  378. px = pIn1 + count;
  379. py = pIn2;
  380. /* Decrement the loop counter */
  381. blkCnt--;
  382. }
  383. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
  384. ** No loop unrolling is used. */
  385. blkCnt = blockSize2 % 0x4u;
  386. while(blkCnt > 0u)
  387. {
  388. /* Accumulator is made zero for every iteration */
  389. sum = 0;
  390. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  391. k = srcBLen >> 2u;
  392. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  393. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  394. while(k > 0u)
  395. {
  396. /* Reading two inputs of SrcA buffer and packing */
  397. in1 = (q15_t) * px++;
  398. in2 = (q15_t) * px++;
  399. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  400. /* Reading two inputs of SrcB buffer and packing */
  401. in1 = (q15_t) * py++;
  402. in2 = (q15_t) * py++;
  403. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  404. /* Perform the multiply-accumulates */
  405. sum = __SMLAD(input1, input2, sum);
  406. /* Reading two inputs of SrcA buffer and packing */
  407. in1 = (q15_t) * px++;
  408. in2 = (q15_t) * px++;
  409. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  410. /* Reading two inputs of SrcB buffer and packing */
  411. in1 = (q15_t) * py++;
  412. in2 = (q15_t) * py++;
  413. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  414. /* Perform the multiply-accumulates */
  415. sum = __SMLAD(input1, input2, sum);
  416. /* Decrement the loop counter */
  417. k--;
  418. }
  419. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  420. ** No loop unrolling is used. */
  421. k = srcBLen % 0x4u;
  422. while(k > 0u)
  423. {
  424. /* Perform the multiply-accumulates */
  425. sum += ((q15_t) * px++ * *py++);
  426. /* Decrement the loop counter */
  427. k--;
  428. }
  429. /* Store the result in the accumulator in the destination buffer. */
  430. *pOut = (q7_t) (__SSAT(sum >> 7, 8));
  431. /* Destination pointer is updated according to the address modifier, inc */
  432. pOut += inc;
  433. /* Increment the pointer pIn1 index, count by 1 */
  434. count++;
  435. /* Update the inputA and inputB pointers for next MAC calculation */
  436. px = pIn1 + count;
  437. py = pIn2;
  438. /* Decrement the loop counter */
  439. blkCnt--;
  440. }
  441. }
  442. else
  443. {
  444. /* If the srcBLen is not a multiple of 4,
  445. * the blockSize2 loop cannot be unrolled by 4 */
  446. blkCnt = blockSize2;
  447. while(blkCnt > 0u)
  448. {
  449. /* Accumulator is made zero for every iteration */
  450. sum = 0;
  451. /* Loop over srcBLen */
  452. k = srcBLen;
  453. while(k > 0u)
  454. {
  455. /* Perform the multiply-accumulate */
  456. sum += ((q15_t) * px++ * *py++);
  457. /* Decrement the loop counter */
  458. k--;
  459. }
  460. /* Store the result in the accumulator in the destination buffer. */
  461. *pOut = (q7_t) (__SSAT(sum >> 7, 8));
  462. /* Destination pointer is updated according to the address modifier, inc */
  463. pOut += inc;
  464. /* Increment the MAC count */
  465. count++;
  466. /* Update the inputA and inputB pointers for next MAC calculation */
  467. px = pIn1 + count;
  468. py = pIn2;
  469. /* Decrement the loop counter */
  470. blkCnt--;
  471. }
  472. }
  473. /* --------------------------
  474. * Initializations of stage3
  475. * -------------------------*/
  476. /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  477. * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  478. * ....
  479. * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
  480. * sum += x[srcALen-1] * y[0]
  481. */
  482. /* In this stage the MAC operations are decreased by 1 for every iteration.
  483. The count variable holds the number of MAC operations performed */
  484. count = srcBLen - 1u;
  485. /* Working pointer of inputA */
  486. pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
  487. px = pSrc1;
  488. /* Working pointer of inputB */
  489. py = pIn2;
  490. /* -------------------
  491. * Stage3 process
  492. * ------------------*/
  493. while(blockSize3 > 0u)
  494. {
  495. /* Accumulator is made zero for every iteration */
  496. sum = 0;
  497. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  498. k = count >> 2u;
  499. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  500. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  501. while(k > 0u)
  502. {
  503. /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */
  504. in1 = (q15_t) * px++;
  505. in2 = (q15_t) * px++;
  506. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  507. /* y[0] , y[1] */
  508. in1 = (q15_t) * py++;
  509. in2 = (q15_t) * py++;
  510. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  511. /* sum += x[srcALen - srcBLen + 1] * y[0] */
  512. /* sum += x[srcALen - srcBLen + 2] * y[1] */
  513. sum = __SMLAD(input1, input2, sum);
  514. /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
  515. in1 = (q15_t) * px++;
  516. in2 = (q15_t) * px++;
  517. input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  518. /* y[2] , y[3] */
  519. in1 = (q15_t) * py++;
  520. in2 = (q15_t) * py++;
  521. input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
  522. /* sum += x[srcALen - srcBLen + 3] * y[2] */
  523. /* sum += x[srcALen - srcBLen + 4] * y[3] */
  524. sum = __SMLAD(input1, input2, sum);
  525. /* Decrement the loop counter */
  526. k--;
  527. }
  528. /* If the count is not a multiple of 4, compute any remaining MACs here.
  529. ** No loop unrolling is used. */
  530. k = count % 0x4u;
  531. while(k > 0u)
  532. {
  533. /* Perform the multiply-accumulates */
  534. sum += ((q15_t) * px++ * *py++);
  535. /* Decrement the loop counter */
  536. k--;
  537. }
  538. /* Store the result in the accumulator in the destination buffer. */
  539. *pOut = (q7_t) (__SSAT(sum >> 7, 8));
  540. /* Destination pointer is updated according to the address modifier, inc */
  541. pOut += inc;
  542. /* Update the inputA and inputB pointers for next MAC calculation */
  543. px = ++pSrc1;
  544. py = pIn2;
  545. /* Decrement the MAC count */
  546. count--;
  547. /* Decrement the loop counter */
  548. blockSize3--;
  549. }
  550. #else
  551. /* Run the below code for Cortex-M0 */
  552. q7_t *pIn1 = pSrcA; /* inputA pointer */
  553. q7_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
  554. q31_t sum; /* Accumulator */
  555. uint32_t i = 0u, j; /* loop counters */
  556. uint32_t inv = 0u; /* Reverse order flag */
  557. uint32_t tot = 0u; /* Length */
  558. /* The algorithm implementation is based on the lengths of the inputs. */
  559. /* srcB is always made to slide across srcA. */
  560. /* So srcBLen is always considered as shorter or equal to srcALen */
  561. /* But CORR(x, y) is reverse of CORR(y, x) */
  562. /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  563. /* and a varaible, inv is set to 1 */
  564. /* If lengths are not equal then zero pad has to be done to make the two
  565. * inputs of same length. But to improve the performance, we include zeroes
  566. * in the output instead of zero padding either of the the inputs*/
  567. /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
  568. * starting of the output buffer */
  569. /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
  570. * ending of the output buffer */
  571. /* Once the zero padding is done the remaining of the output is calcualted
  572. * using convolution but with the shorter signal time shifted. */
  573. /* Calculate the length of the remaining sequence */
  574. tot = ((srcALen + srcBLen) - 2u);
  575. if(srcALen > srcBLen)
  576. {
  577. /* Calculating the number of zeros to be padded to the output */
  578. j = srcALen - srcBLen;
  579. /* Initialise the pointer after zero padding */
  580. pDst += j;
  581. }
  582. else if(srcALen < srcBLen)
  583. {
  584. /* Initialization to inputB pointer */
  585. pIn1 = pSrcB;
  586. /* Initialization to the end of inputA pointer */
  587. pIn2 = pSrcA + (srcALen - 1u);
  588. /* Initialisation of the pointer after zero padding */
  589. pDst = pDst + tot;
  590. /* Swapping the lengths */
  591. j = srcALen;
  592. srcALen = srcBLen;
  593. srcBLen = j;
  594. /* Setting the reverse flag */
  595. inv = 1;
  596. }
  597. /* Loop to calculate convolution for output length number of times */
  598. for (i = 0u; i <= tot; i++)
  599. {
  600. /* Initialize sum with zero to carry on MAC operations */
  601. sum = 0;
  602. /* Loop to perform MAC operations according to convolution equation */
  603. for (j = 0u; j <= i; j++)
  604. {
  605. /* Check the array limitations */
  606. if((((i - j) < srcBLen) && (j < srcALen)))
  607. {
  608. /* z[i] += x[i-j] * y[j] */
  609. sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
  610. }
  611. }
  612. /* Store the output in the destination buffer */
  613. if(inv == 1)
  614. *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u);
  615. else
  616. *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u);
  617. }
  618. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  619. }
  620. /**
  621. * @} end of Corr group
  622. */