Keyboard firmwares for Atmel AVR and Cortex-M
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

arm_correlate_f32.c 23KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739
  1. /* ----------------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_correlate_f32.c
  9. *
  10. * Description: Correlation of floating-point sequences.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @defgroup Corr Correlation
  46. *
  47. * Correlation is a mathematical operation that is similar to convolution.
  48. * As with convolution, correlation uses two signals to produce a third signal.
  49. * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
  50. * Correlation is commonly used to measure the similarity between two signals.
  51. * It has applications in pattern recognition, cryptanalysis, and searching.
  52. * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
  53. * Fast versions of the Q15 and Q31 functions are also provided.
  54. *
  55. * \par Algorithm
  56. * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
  57. * The convolution of the two signals is denoted by
  58. * <pre>
  59. * c[n] = a[n] * b[n]
  60. * </pre>
  61. * In correlation, one of the signals is flipped in time
  62. * <pre>
  63. * c[n] = a[n] * b[-n]
  64. * </pre>
  65. *
  66. * \par
  67. * and this is mathematically defined as
  68. * \image html CorrelateEquation.gif
  69. * \par
  70. * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
  71. * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
  72. * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
  73. *
  74. * <b>Note</b>
  75. * \par
  76. * The <code>pDst</code> should be initialized to all zeros before being used.
  77. *
  78. * <b>Fixed-Point Behavior</b>
  79. * \par
  80. * Correlation requires summing up a large number of intermediate products.
  81. * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
  82. * Refer to the function specific documentation below for further details of the particular algorithm used.
  83. *
  84. *
  85. * <b>Fast Versions</b>
  86. *
  87. * \par
  88. * Fast versions are supported for Q31 and Q15. They take fewer cycles than the standard Q31 and Q15 correlate functions, but the design requires
  89. * the input signals to be scaled down to avoid intermediate overflows.
  90. *
  91. *
  92. * <b>Opt Versions</b>
  93. *
  94. * \par
  95. * Opt versions are supported for Q15 and Q7. The design uses an internal scratch buffer to achieve better optimisation.
  96. * These versions are optimised for cycles and consume more memory (scratch memory) than the standard Q15 and Q7 versions of correlate.
  97. */
  98. /**
  99. * @addtogroup Corr
  100. * @{
  101. */
  /**
   * @brief Correlation of floating-point sequences.
   * @param[in]  *pSrcA  points to the first input sequence.
   * @param[in]  srcALen length of the first input sequence.
   * @param[in]  *pSrcB  points to the second input sequence.
   * @param[in]  srcBLen length of the second input sequence.
   * @param[out] *pDst   points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
   * @return none.
   *
   * \par
   * Only <code>srcALen + srcBLen - 1</code> output samples are computed and stored.
   * The remaining <code>|srcALen - srcBLen|</code> samples (at the start of <code>pDst</code>
   * when <code>srcALen > srcBLen</code>, at the end when <code>srcALen < srcBLen</code>)
   * are never written, so the caller must zero <code>pDst</code> beforehand.
   *
   * \par
   * If either length is zero the function returns without touching <code>pDst</code>.
   */
  void arm_correlate_f32(
    float32_t * pSrcA,
    uint32_t srcALen,
    float32_t * pSrcB,
    uint32_t srcBLen,
    float32_t * pDst)
  {
  #ifndef ARM_MATH_CM0_FAMILY

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    float32_t *pIn1;                               /* inputA pointer (the longer sequence) */
    float32_t *pIn2;                               /* inputB pointer (the shorter sequence) */
    float32_t *pOut = pDst;                        /* output pointer */
    float32_t *px;                                 /* Intermediate inputA pointer */
    float32_t *py;                                 /* Intermediate inputB pointer */
    float32_t *pSrc1;                              /* Intermediate pointer */
    float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulators */
    float32_t x0, x1, x2, x3, c0;                  /* Temporaries holding input and coefficient values */
    uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* Loop counters */
    int32_t inc = 1;                               /* Destination address modifier */

    /* Nothing to correlate: bail out before (len - 1u) can wrap around. */
    if((srcALen == 0u) || (srcBLen == 0u))
    {
      return;
    }

    /* srcB is always made to slide across srcA, so internally srcBLen is
     * always the shorter (or equal) length.
     * CORR(x, y) is the reverse of CORR(y, x); hence when srcBLen > srcALen the
     * inputs are swapped, the output pointer is set to the last computed
     * sample and the destination modifier inc is set to -1.
     * Instead of zero padding the shorter input, the padding shows up as
     * zeros in the output:
     *   srcALen > srcBLen : (srcALen - srcBLen) zeros at the start of pDst
     *   srcALen < srcBLen : (srcBLen - srcALen) zeros at the end of pDst
     * Those samples are skipped here, not written. */
    if(srcALen >= srcBLen)
    {
      /* Initialization of inputA pointer */
      pIn1 = pSrcA;

      /* Initialization of inputB pointer */
      pIn2 = pSrcB;

      /* Number of output samples is calculated */
      outBlockSize = (2u * srcALen) - 1u;

      /* (outBlockSize - (srcALen + srcBLen - 1)) leading output samples
       * correspond to the zero padding of srcB. */
      j = outBlockSize - (srcALen + (srcBLen - 1u));

      /* Skip the leading zero samples (left as provided by the caller) */
      pOut += j;
    }
    else
    {
      /* Initialization of inputA pointer */
      pIn1 = pSrcB;

      /* Initialization of inputB pointer */
      pIn2 = pSrcA;

      /* Swap the lengths so that srcBLen is the shorter one */
      j = srcBLen;
      srcBLen = srcALen;
      srcALen = j;

      /* CORR(x, y) = Reverse order(CORR(y, x)):
       * point to the last computed output sample and walk backwards */
      pOut = pDst + ((srcALen + srcBLen) - 2u);

      /* Destination address modifier is set to -1 */
      inc = -1;
    }

    /* The function is internally divided into three stages according to the
     * number of multiplications between inputA and inputB samples:
     *   stage 1: the number of MACs grows by one per output sample,
     *   stage 2: srcBLen MACs per output sample,
     *   stage 3: the number of MACs shrinks by one per output sample. */
    blockSize1 = srcBLen - 1u;
    blockSize2 = srcALen - (srcBLen - 1u);
    blockSize3 = blockSize1;

    /* --------------------------
     * Initializations of stage1
     * -------------------------*/

    /* sum = x[0] * y[srcBLen - 1]
     * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 1]
     * ....
     * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
     */

    /* The count variable holds the number of MAC operations performed */
    count = 1u;

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    pSrc1 = pIn2 + (srcBLen - 1u);
    py = pSrc1;

    /* ------------------------
     * Stage1 process
     * ----------------------*/
    while(blockSize1 > 0u)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* Loop unrolling: compute 4 MACs at a time */
      k = count >> 2u;

      while(k > 0u)
      {
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Remaining 0 to 3 MACs, no loop unrolling */
      k = count % 0x4u;

      while(k > 0u)
      {
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the destination buffer */
      *pOut = sum;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Update the inputA and inputB pointers for next MAC calculation */
      py = pSrc1 - count;
      px = pIn1;

      /* Increment the MAC count */
      count++;

      /* Decrement the loop counter */
      blockSize1--;
    }

    /* --------------------------
     * Initializations of stage2
     * ------------------------*/

    /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
     * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
     * ....
     * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
     */

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    py = pIn2;

    /* count is the index by which the pointer pIn1 is to be advanced */
    count = 0u;

    /* -------------------
     * Stage2 process
     * ------------------*/

    /* Stage2 performs srcBLen MACs per output sample.  The four-outputs-at-once
     * unrolling below needs at least 4 coefficients (its inner do-while runs
     * srcBLen / 4 >= 1 times). */
    if(srcBLen >= 4u)
    {
      /* Loop unroll over blockSize2, by 4 */
      blkCnt = blockSize2 >> 2u;

      while(blkCnt > 0u)
      {
        /* Set all accumulators to zero */
        acc0 = 0.0f;
        acc1 = 0.0f;
        acc2 = 0.0f;
        acc3 = 0.0f;

        /* read x[0], x[1], x[2] samples */
        x0 = *(px++);
        x1 = *(px++);
        x2 = *(px++);

        /* Loop unrolling: 4 coefficients per pass */
        k = srcBLen >> 2u;

        do
        {
          /* Read y[0] sample */
          c0 = *(py++);

          /* Read x[3] sample */
          x3 = *(px++);

          /* acc0 += x[0] * y[0] */
          acc0 += x0 * c0;
          /* acc1 += x[1] * y[0] */
          acc1 += x1 * c0;
          /* acc2 += x[2] * y[0] */
          acc2 += x2 * c0;
          /* acc3 += x[3] * y[0] */
          acc3 += x3 * c0;

          /* Read y[1] sample */
          c0 = *(py++);

          /* Read x[4] sample */
          x0 = *(px++);

          /* acc0 += x[1] * y[1] */
          acc0 += x1 * c0;
          /* acc1 += x[2] * y[1] */
          acc1 += x2 * c0;
          /* acc2 += x[3] * y[1] */
          acc2 += x3 * c0;
          /* acc3 += x[4] * y[1] */
          acc3 += x0 * c0;

          /* Read y[2] sample */
          c0 = *(py++);

          /* Read x[5] sample */
          x1 = *(px++);

          /* acc0 += x[2] * y[2] */
          acc0 += x2 * c0;
          /* acc1 += x[3] * y[2] */
          acc1 += x3 * c0;
          /* acc2 += x[4] * y[2] */
          acc2 += x0 * c0;
          /* acc3 += x[5] * y[2] */
          acc3 += x1 * c0;

          /* Read y[3] sample */
          c0 = *(py++);

          /* Read x[6] sample */
          x2 = *(px++);

          /* acc0 += x[3] * y[3] */
          acc0 += x3 * c0;
          /* acc1 += x[4] * y[3] */
          acc1 += x0 * c0;
          /* acc2 += x[5] * y[3] */
          acc2 += x1 * c0;
          /* acc3 += x[6] * y[3] */
          acc3 += x2 * c0;

        } while(--k);

        /* Remaining 0 to 3 coefficients, no loop unrolling */
        k = srcBLen % 0x4u;

        while(k > 0u)
        {
          /* Read y[4] sample */
          c0 = *(py++);

          /* Read x[7] sample */
          x3 = *(px++);

          /* acc0 += x[4] * y[4] */
          acc0 += x0 * c0;
          /* acc1 += x[5] * y[4] */
          acc1 += x1 * c0;
          /* acc2 += x[6] * y[4] */
          acc2 += x2 * c0;
          /* acc3 += x[7] * y[4] */
          acc3 += x3 * c0;

          /* Reuse the present samples for the next MAC */
          x0 = x1;
          x1 = x2;
          x2 = x3;

          /* Decrement the loop counter */
          k--;
        }

        /* Store the four results; the destination pointer moves by inc each time */
        *pOut = acc0;
        pOut += inc;

        *pOut = acc1;
        pOut += inc;

        *pOut = acc2;
        pOut += inc;

        *pOut = acc3;
        pOut += inc;

        /* Increment the pointer pIn1 index, count by 4 */
        count += 4u;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = pIn1 + count;
        py = pIn2;

        /* Decrement the loop counter */
        blkCnt--;
      }

      /* If blockSize2 is not a multiple of 4, compute the remaining
       * output samples one at a time. */
      blkCnt = blockSize2 % 0x4u;

      while(blkCnt > 0u)
      {
        /* Accumulator is made zero for every iteration */
        sum = 0.0f;

        /* Loop unrolling: compute 4 MACs at a time */
        k = srcBLen >> 2u;

        while(k > 0u)
        {
          sum += *px++ * *py++;
          sum += *px++ * *py++;
          sum += *px++ * *py++;
          sum += *px++ * *py++;

          /* Decrement the loop counter */
          k--;
        }

        /* Remaining 0 to 3 MACs, no loop unrolling */
        k = srcBLen % 0x4u;

        while(k > 0u)
        {
          sum += *px++ * *py++;

          /* Decrement the loop counter */
          k--;
        }

        /* Store the result in the destination buffer */
        *pOut = sum;
        /* Destination pointer is updated according to the address modifier, inc */
        pOut += inc;

        /* Increment the pointer pIn1 index, count by 1 */
        count++;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = pIn1 + count;
        py = pIn2;

        /* Decrement the loop counter */
        blkCnt--;
      }
    }
    else
    {
      /* srcBLen is less than 4, so the blockSize2 loop
       * cannot be unrolled by 4 */
      blkCnt = blockSize2;

      while(blkCnt > 0u)
      {
        /* Accumulator is made zero for every iteration */
        sum = 0.0f;

        /* Loop over srcBLen */
        k = srcBLen;

        while(k > 0u)
        {
          /* Perform the multiply-accumulate */
          sum += *px++ * *py++;

          /* Decrement the loop counter */
          k--;
        }

        /* Store the result in the destination buffer */
        *pOut = sum;
        /* Destination pointer is updated according to the address modifier, inc */
        pOut += inc;

        /* Increment the pointer pIn1 index, count by 1 */
        count++;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = pIn1 + count;
        py = pIn2;

        /* Decrement the loop counter */
        blkCnt--;
      }
    }

    /* --------------------------
     * Initializations of stage3
     * -------------------------*/

    /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
     * ....
     * sum = x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
     * sum = x[srcALen-1] * y[0]
     */

    /* The count variable holds the number of MAC operations performed */
    count = srcBLen - 1u;

    /* Working pointer of inputA */
    pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
    px = pSrc1;

    /* Working pointer of inputB */
    py = pIn2;

    /* -------------------
     * Stage3 process
     * ------------------*/
    while(blockSize3 > 0u)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* Loop unrolling: compute 4 MACs at a time */
      k = count >> 2u;

      while(k > 0u)
      {
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Remaining 0 to 3 MACs, no loop unrolling */
      k = count % 0x4u;

      while(k > 0u)
      {
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the destination buffer */
      *pOut = sum;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = ++pSrc1;
      py = pIn2;

      /* Decrement the MAC count */
      count--;

      /* Decrement the loop counter */
      blockSize3--;
    }

  #else

    /* Run the below code for Cortex-M0 */

    float32_t *pIn1;                               /* inputA pointer */
    float32_t *pIn2;                               /* inputB pointer (points at the LAST sample) */
    float32_t sum;                                 /* Accumulator */
    uint32_t i = 0u, j;                            /* Loop counters */
    uint32_t inv = 0u;                             /* Reverse order flag */
    uint32_t tot = 0u;                             /* Index of the last computed output sample */

    /* Nothing to correlate: bail out before (len - 1u) can wrap around. */
    if((srcALen == 0u) || (srcBLen == 0u))
    {
      return;
    }

    pIn1 = pSrcA;
    pIn2 = pSrcB + (srcBLen - 1u);

    /* srcB is always made to slide across srcA, so internally srcBLen is
     * always the shorter (or equal) length.
     * CORR(x, y) is the reverse of CORR(y, x); hence when srcBLen > srcALen the
     * inputs are swapped, the output is written backwards and inv is set to 1.
     * Instead of zero padding the shorter input, the padding shows up as
     * zeros in the output:
     *   srcALen > srcBLen : (srcALen - srcBLen) zeros at the start of pDst
     *   srcALen < srcBLen : (srcBLen - srcALen) zeros at the end of pDst
     * Those samples are skipped here, not written.  The remaining output is
     * calculated as a convolution with the shorter signal reversed in time. */

    /* Index of the last computed output sample */
    tot = ((srcALen + srcBLen) - 2u);

    if(srcALen > srcBLen)
    {
      /* Number of leading zero samples in the output */
      j = srcALen - srcBLen;

      /* Skip them (left as provided by the caller) */
      pDst += j;
    }
    else if(srcALen < srcBLen)
    {
      /* Initialization to inputB pointer */
      pIn1 = pSrcB;

      /* Initialization to the end of inputA pointer */
      pIn2 = pSrcA + (srcALen - 1u);

      /* Start at the last computed output sample and walk backwards */
      pDst = pDst + tot;

      /* Swapping the lengths */
      j = srcALen;
      srcALen = srcBLen;
      srcBLen = j;

      /* Setting the reverse flag */
      inv = 1;
    }

    /* Loop to calculate the (tot + 1) computed output samples */
    for (i = 0u; i <= tot; i++)
    {
      /* Initialize sum with zero to carry on MAC operations */
      sum = 0.0f;

      /* Loop to perform MAC operations according to the convolution equation */
      for (j = 0u; j <= i; j++)
      {
        /* Check the array limitations */
        if(((i - j) < srcBLen) && (j < srcALen))
        {
          /* z[i] += x[j] * y[srcBLen - 1 - (i - j)]
           * Both operands are cast so the offset is a genuinely negative
           * signed value; a mixed int32_t/uint32_t expression would be
           * evaluated as unsigned and only work via 32-bit wrap-around. */
          sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
        }
      }

      /* Store the output in the destination buffer */
      if(inv == 1)
      {
        *pDst-- = sum;
      }
      else
      {
        *pDst++ = sum;
      }
    }

  #endif /*   #ifndef ARM_MATH_CM0_FAMILY */

  }

  /**
   * @} end of Corr group
   */
  604. /**
  605. * @} end of Corr group
  606. */