Keyboard firmwares for Atmel AVR and Cortex-M
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

arm_correlate_fast_q15.c 36KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_correlate_fast_q15.c
  9. *
  10. * Description: Fast Q15 Correlation.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup Corr
  46. * @{
  47. */
  48. /**
  49. * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  50. * @param[in] *pSrcA points to the first input sequence.
  51. * @param[in] srcALen length of the first input sequence.
  52. * @param[in] *pSrcB points to the second input sequence.
  53. * @param[in] srcBLen length of the second input sequence.
  54. * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
  55. * @return none.
  56. *
  57. * <b>Scaling and Overflow Behavior:</b>
  58. *
  59. * \par
  60. * This fast version uses a 32-bit accumulator with 2.30 format.
  61. * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  62. * There is no saturation on intermediate additions.
  63. * Thus, if the accumulator overflows it wraps around and distorts the result.
  64. * The input signals should be scaled down to avoid intermediate overflows.
  65. * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since at most
  66. * min(srcALen, srcBLen) additions are carried out internally.
  67. * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
  68. *
  69. * \par
  70. * See <code>arm_correlate_q15()</code> for a slower implementation of this function, which uses a 64-bit accumulator to avoid wrap-around distortion.
  71. */
  72. void arm_correlate_fast_q15(
  73. q15_t * pSrcA,
  74. uint32_t srcALen,
  75. q15_t * pSrcB,
  76. uint32_t srcBLen,
  77. q15_t * pDst)
  78. {
  79. #ifndef UNALIGNED_SUPPORT_DISABLE
  80. q15_t *pIn1; /* inputA pointer */
  81. q15_t *pIn2; /* inputB pointer */
  82. q15_t *pOut = pDst; /* output pointer */
  83. q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
  84. q15_t *px; /* Intermediate inputA pointer */
  85. q15_t *py; /* Intermediate inputB pointer */
  86. q15_t *pSrc1; /* Intermediate pointers */
  87. q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
  88. uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
  89. int32_t inc = 1; /* Destination address modifier */
  90. /* The algorithm implementation is based on the lengths of the inputs. */
  91. /* srcB is always made to slide across srcA. */
  92. /* So srcBLen is always considered as shorter or equal to srcALen */
  93. /* But CORR(x, y) is reverse of CORR(y, x) */
  94. /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  95. /* and the destination pointer modifier, inc is set to -1 */
  96. /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  97. /* But to improve the performance,
  98. * we include zeroes in the output instead of zero padding either of the the inputs*/
  99. /* If srcALen > srcBLen,
  100. * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
  101. /* If srcALen < srcBLen,
  102. * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
  103. if(srcALen >= srcBLen)
  104. {
  105. /* Initialization of inputA pointer */
  106. pIn1 = (pSrcA);
  107. /* Initialization of inputB pointer */
  108. pIn2 = (pSrcB);
  109. /* Number of output samples is calculated */
  110. outBlockSize = (2u * srcALen) - 1u;
  111. /* When srcALen > srcBLen, zero padding is done to srcB
  112. * to make their lengths equal.
  113. * Instead, (outBlockSize - (srcALen + srcBLen - 1))
  114. * number of output samples are made zero */
  115. j = outBlockSize - (srcALen + (srcBLen - 1u));
  116. /* Updating the pointer position to non zero value */
  117. pOut += j;
  118. }
  119. else
  120. {
  121. /* Initialization of inputA pointer */
  122. pIn1 = (pSrcB);
  123. /* Initialization of inputB pointer */
  124. pIn2 = (pSrcA);
  125. /* srcBLen is always considered as shorter or equal to srcALen */
  126. j = srcBLen;
  127. srcBLen = srcALen;
  128. srcALen = j;
  129. /* CORR(x, y) = Reverse order(CORR(y, x)) */
  130. /* Hence set the destination pointer to point to the last output sample */
  131. pOut = pDst + ((srcALen + srcBLen) - 2u);
  132. /* Destination address modifier is set to -1 */
  133. inc = -1;
  134. }
  135. /* The function is internally
  136. * divided into three parts according to the number of multiplications that has to be
  137. * taken place between inputA samples and inputB samples. In the first part of the
  138. * algorithm, the multiplications increase by one for every iteration.
  139. * In the second part of the algorithm, srcBLen number of multiplications are done.
  140. * In the third part of the algorithm, the multiplications decrease by one
  141. * for every iteration.*/
  142. /* The algorithm is implemented in three stages.
  143. * The loop counters of each stage is initiated here. */
  144. blockSize1 = srcBLen - 1u;
  145. blockSize2 = srcALen - (srcBLen - 1u);
  146. blockSize3 = blockSize1;
  147. /* --------------------------
  148. * Initializations of stage1
  149. * -------------------------*/
  150. /* sum = x[0] * y[srcBlen - 1]
  151. * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
  152. * ....
  153. * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
  154. */
  155. /* In this stage the MAC operations are increased by 1 for every iteration.
  156. The count variable holds the number of MAC operations performed */
  157. count = 1u;
  158. /* Working pointer of inputA */
  159. px = pIn1;
  160. /* Working pointer of inputB */
  161. pSrc1 = pIn2 + (srcBLen - 1u);
  162. py = pSrc1;
  163. /* ------------------------
  164. * Stage1 process
  165. * ----------------------*/
  166. /* The first loop starts here */
  167. while(blockSize1 > 0u)
  168. {
  169. /* Accumulator is made zero for every iteration */
  170. sum = 0;
  171. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  172. k = count >> 2;
  173. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  174. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  175. while(k > 0u)
  176. {
  177. /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
  178. sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
  179. /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
  180. sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
  181. /* Decrement the loop counter */
  182. k--;
  183. }
  184. /* If the count is not a multiple of 4, compute any remaining MACs here.
  185. ** No loop unrolling is used. */
  186. k = count % 0x4u;
  187. while(k > 0u)
  188. {
  189. /* Perform the multiply-accumulates */
  190. /* x[0] * y[srcBLen - 1] */
  191. sum = __SMLAD(*px++, *py++, sum);
  192. /* Decrement the loop counter */
  193. k--;
  194. }
  195. /* Store the result in the accumulator in the destination buffer. */
  196. *pOut = (q15_t) (sum >> 15);
  197. /* Destination pointer is updated according to the address modifier, inc */
  198. pOut += inc;
  199. /* Update the inputA and inputB pointers for next MAC calculation */
  200. py = pSrc1 - count;
  201. px = pIn1;
  202. /* Increment the MAC count */
  203. count++;
  204. /* Decrement the loop counter */
  205. blockSize1--;
  206. }
  207. /* --------------------------
  208. * Initializations of stage2
  209. * ------------------------*/
  210. /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
  211. * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
  212. * ....
  213. * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  214. */
  215. /* Working pointer of inputA */
  216. px = pIn1;
  217. /* Working pointer of inputB */
  218. py = pIn2;
  219. /* count is index by which the pointer pIn1 to be incremented */
  220. count = 0u;
  221. /* -------------------
  222. * Stage2 process
  223. * ------------------*/
  224. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  225. * So, to loop unroll over blockSize2,
  226. * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
  227. if(srcBLen >= 4u)
  228. {
  229. /* Loop unroll over blockSize2, by 4 */
  230. blkCnt = blockSize2 >> 2u;
  231. while(blkCnt > 0u)
  232. {
  233. /* Set all accumulators to zero */
  234. acc0 = 0;
  235. acc1 = 0;
  236. acc2 = 0;
  237. acc3 = 0;
  238. /* read x[0], x[1] samples */
  239. x0 = *__SIMD32(px);
  240. /* read x[1], x[2] samples */
  241. x1 = _SIMD32_OFFSET(px + 1);
  242. px += 2u;
  243. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  244. k = srcBLen >> 2u;
  245. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  246. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  247. do
  248. {
  249. /* Read the first two inputB samples using SIMD:
  250. * y[0] and y[1] */
  251. c0 = *__SIMD32(py)++;
  252. /* acc0 += x[0] * y[0] + x[1] * y[1] */
  253. acc0 = __SMLAD(x0, c0, acc0);
  254. /* acc1 += x[1] * y[0] + x[2] * y[1] */
  255. acc1 = __SMLAD(x1, c0, acc1);
  256. /* Read x[2], x[3] */
  257. x2 = *__SIMD32(px);
  258. /* Read x[3], x[4] */
  259. x3 = _SIMD32_OFFSET(px + 1);
  260. /* acc2 += x[2] * y[0] + x[3] * y[1] */
  261. acc2 = __SMLAD(x2, c0, acc2);
  262. /* acc3 += x[3] * y[0] + x[4] * y[1] */
  263. acc3 = __SMLAD(x3, c0, acc3);
  264. /* Read y[2] and y[3] */
  265. c0 = *__SIMD32(py)++;
  266. /* acc0 += x[2] * y[2] + x[3] * y[3] */
  267. acc0 = __SMLAD(x2, c0, acc0);
  268. /* acc1 += x[3] * y[2] + x[4] * y[3] */
  269. acc1 = __SMLAD(x3, c0, acc1);
  270. /* Read x[4], x[5] */
  271. x0 = _SIMD32_OFFSET(px + 2);
  272. /* Read x[5], x[6] */
  273. x1 = _SIMD32_OFFSET(px + 3);
  274. px += 4u;
  275. /* acc2 += x[4] * y[2] + x[5] * y[3] */
  276. acc2 = __SMLAD(x0, c0, acc2);
  277. /* acc3 += x[5] * y[2] + x[6] * y[3] */
  278. acc3 = __SMLAD(x1, c0, acc3);
  279. } while(--k);
  280. /* For the next MAC operations, SIMD is not used
  281. * So, the 16 bit pointer if inputB, py is updated */
  282. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  283. ** No loop unrolling is used. */
  284. k = srcBLen % 0x4u;
  285. if(k == 1u)
  286. {
  287. /* Read y[4] */
  288. c0 = *py;
  289. #ifdef ARM_MATH_BIG_ENDIAN
  290. c0 = c0 << 16u;
  291. #else
  292. c0 = c0 & 0x0000FFFF;
  293. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  294. /* Read x[7] */
  295. x3 = *__SIMD32(px);
  296. px++;
  297. /* Perform the multiply-accumulates */
  298. acc0 = __SMLAD(x0, c0, acc0);
  299. acc1 = __SMLAD(x1, c0, acc1);
  300. acc2 = __SMLADX(x1, c0, acc2);
  301. acc3 = __SMLADX(x3, c0, acc3);
  302. }
  303. if(k == 2u)
  304. {
  305. /* Read y[4], y[5] */
  306. c0 = *__SIMD32(py);
  307. /* Read x[7], x[8] */
  308. x3 = *__SIMD32(px);
  309. /* Read x[9] */
  310. x2 = _SIMD32_OFFSET(px + 1);
  311. px += 2u;
  312. /* Perform the multiply-accumulates */
  313. acc0 = __SMLAD(x0, c0, acc0);
  314. acc1 = __SMLAD(x1, c0, acc1);
  315. acc2 = __SMLAD(x3, c0, acc2);
  316. acc3 = __SMLAD(x2, c0, acc3);
  317. }
  318. if(k == 3u)
  319. {
  320. /* Read y[4], y[5] */
  321. c0 = *__SIMD32(py)++;
  322. /* Read x[7], x[8] */
  323. x3 = *__SIMD32(px);
  324. /* Read x[9] */
  325. x2 = _SIMD32_OFFSET(px + 1);
  326. /* Perform the multiply-accumulates */
  327. acc0 = __SMLAD(x0, c0, acc0);
  328. acc1 = __SMLAD(x1, c0, acc1);
  329. acc2 = __SMLAD(x3, c0, acc2);
  330. acc3 = __SMLAD(x2, c0, acc3);
  331. c0 = (*py);
  332. /* Read y[6] */
  333. #ifdef ARM_MATH_BIG_ENDIAN
  334. c0 = c0 << 16u;
  335. #else
  336. c0 = c0 & 0x0000FFFF;
  337. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  338. /* Read x[10] */
  339. x3 = _SIMD32_OFFSET(px + 2);
  340. px += 3u;
  341. /* Perform the multiply-accumulates */
  342. acc0 = __SMLADX(x1, c0, acc0);
  343. acc1 = __SMLAD(x2, c0, acc1);
  344. acc2 = __SMLADX(x2, c0, acc2);
  345. acc3 = __SMLADX(x3, c0, acc3);
  346. }
  347. /* Store the result in the accumulator in the destination buffer. */
  348. *pOut = (q15_t) (acc0 >> 15);
  349. /* Destination pointer is updated according to the address modifier, inc */
  350. pOut += inc;
  351. *pOut = (q15_t) (acc1 >> 15);
  352. pOut += inc;
  353. *pOut = (q15_t) (acc2 >> 15);
  354. pOut += inc;
  355. *pOut = (q15_t) (acc3 >> 15);
  356. pOut += inc;
  357. /* Increment the pointer pIn1 index, count by 1 */
  358. count += 4u;
  359. /* Update the inputA and inputB pointers for next MAC calculation */
  360. px = pIn1 + count;
  361. py = pIn2;
  362. /* Decrement the loop counter */
  363. blkCnt--;
  364. }
  365. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
  366. ** No loop unrolling is used. */
  367. blkCnt = blockSize2 % 0x4u;
  368. while(blkCnt > 0u)
  369. {
  370. /* Accumulator is made zero for every iteration */
  371. sum = 0;
  372. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  373. k = srcBLen >> 2u;
  374. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  375. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  376. while(k > 0u)
  377. {
  378. /* Perform the multiply-accumulates */
  379. sum += ((q31_t) * px++ * *py++);
  380. sum += ((q31_t) * px++ * *py++);
  381. sum += ((q31_t) * px++ * *py++);
  382. sum += ((q31_t) * px++ * *py++);
  383. /* Decrement the loop counter */
  384. k--;
  385. }
  386. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  387. ** No loop unrolling is used. */
  388. k = srcBLen % 0x4u;
  389. while(k > 0u)
  390. {
  391. /* Perform the multiply-accumulates */
  392. sum += ((q31_t) * px++ * *py++);
  393. /* Decrement the loop counter */
  394. k--;
  395. }
  396. /* Store the result in the accumulator in the destination buffer. */
  397. *pOut = (q15_t) (sum >> 15);
  398. /* Destination pointer is updated according to the address modifier, inc */
  399. pOut += inc;
  400. /* Increment the pointer pIn1 index, count by 1 */
  401. count++;
  402. /* Update the inputA and inputB pointers for next MAC calculation */
  403. px = pIn1 + count;
  404. py = pIn2;
  405. /* Decrement the loop counter */
  406. blkCnt--;
  407. }
  408. }
  409. else
  410. {
  411. /* If the srcBLen is not a multiple of 4,
  412. * the blockSize2 loop cannot be unrolled by 4 */
  413. blkCnt = blockSize2;
  414. while(blkCnt > 0u)
  415. {
  416. /* Accumulator is made zero for every iteration */
  417. sum = 0;
  418. /* Loop over srcBLen */
  419. k = srcBLen;
  420. while(k > 0u)
  421. {
  422. /* Perform the multiply-accumulate */
  423. sum += ((q31_t) * px++ * *py++);
  424. /* Decrement the loop counter */
  425. k--;
  426. }
  427. /* Store the result in the accumulator in the destination buffer. */
  428. *pOut = (q15_t) (sum >> 15);
  429. /* Destination pointer is updated according to the address modifier, inc */
  430. pOut += inc;
  431. /* Increment the MAC count */
  432. count++;
  433. /* Update the inputA and inputB pointers for next MAC calculation */
  434. px = pIn1 + count;
  435. py = pIn2;
  436. /* Decrement the loop counter */
  437. blkCnt--;
  438. }
  439. }
  440. /* --------------------------
  441. * Initializations of stage3
  442. * -------------------------*/
  443. /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  444. * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  445. * ....
  446. * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
  447. * sum += x[srcALen-1] * y[0]
  448. */
  449. /* In this stage the MAC operations are decreased by 1 for every iteration.
  450. The count variable holds the number of MAC operations performed */
  451. count = srcBLen - 1u;
  452. /* Working pointer of inputA */
  453. pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
  454. px = pSrc1;
  455. /* Working pointer of inputB */
  456. py = pIn2;
  457. /* -------------------
  458. * Stage3 process
  459. * ------------------*/
  460. while(blockSize3 > 0u)
  461. {
  462. /* Accumulator is made zero for every iteration */
  463. sum = 0;
  464. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  465. k = count >> 2u;
  466. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  467. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  468. while(k > 0u)
  469. {
  470. /* Perform the multiply-accumulates */
  471. /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
  472. sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
  473. /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
  474. sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
  475. /* Decrement the loop counter */
  476. k--;
  477. }
  478. /* If the count is not a multiple of 4, compute any remaining MACs here.
  479. ** No loop unrolling is used. */
  480. k = count % 0x4u;
  481. while(k > 0u)
  482. {
  483. /* Perform the multiply-accumulates */
  484. sum = __SMLAD(*px++, *py++, sum);
  485. /* Decrement the loop counter */
  486. k--;
  487. }
  488. /* Store the result in the accumulator in the destination buffer. */
  489. *pOut = (q15_t) (sum >> 15);
  490. /* Destination pointer is updated according to the address modifier, inc */
  491. pOut += inc;
  492. /* Update the inputA and inputB pointers for next MAC calculation */
  493. px = ++pSrc1;
  494. py = pIn2;
  495. /* Decrement the MAC count */
  496. count--;
  497. /* Decrement the loop counter */
  498. blockSize3--;
  499. }
  500. #else
  501. q15_t *pIn1; /* inputA pointer */
  502. q15_t *pIn2; /* inputB pointer */
  503. q15_t *pOut = pDst; /* output pointer */
  504. q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
  505. q15_t *px; /* Intermediate inputA pointer */
  506. q15_t *py; /* Intermediate inputB pointer */
  507. q15_t *pSrc1; /* Intermediate pointers */
  508. q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
  509. uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
  510. int32_t inc = 1; /* Destination address modifier */
  511. q15_t a, b;
  512. /* The algorithm implementation is based on the lengths of the inputs. */
  513. /* srcB is always made to slide across srcA. */
  514. /* So srcBLen is always considered as shorter or equal to srcALen */
  515. /* But CORR(x, y) is reverse of CORR(y, x) */
  516. /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  517. /* and the destination pointer modifier, inc is set to -1 */
  518. /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  519. /* But to improve the performance,
  520. * we include zeroes in the output instead of zero padding either of the the inputs*/
  521. /* If srcALen > srcBLen,
  522. * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
  523. /* If srcALen < srcBLen,
  524. * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
  525. if(srcALen >= srcBLen)
  526. {
  527. /* Initialization of inputA pointer */
  528. pIn1 = (pSrcA);
  529. /* Initialization of inputB pointer */
  530. pIn2 = (pSrcB);
  531. /* Number of output samples is calculated */
  532. outBlockSize = (2u * srcALen) - 1u;
  533. /* When srcALen > srcBLen, zero padding is done to srcB
  534. * to make their lengths equal.
  535. * Instead, (outBlockSize - (srcALen + srcBLen - 1))
  536. * number of output samples are made zero */
  537. j = outBlockSize - (srcALen + (srcBLen - 1u));
  538. /* Updating the pointer position to non zero value */
  539. pOut += j;
  540. }
  541. else
  542. {
  543. /* Initialization of inputA pointer */
  544. pIn1 = (pSrcB);
  545. /* Initialization of inputB pointer */
  546. pIn2 = (pSrcA);
  547. /* srcBLen is always considered as shorter or equal to srcALen */
  548. j = srcBLen;
  549. srcBLen = srcALen;
  550. srcALen = j;
  551. /* CORR(x, y) = Reverse order(CORR(y, x)) */
  552. /* Hence set the destination pointer to point to the last output sample */
  553. pOut = pDst + ((srcALen + srcBLen) - 2u);
  554. /* Destination address modifier is set to -1 */
  555. inc = -1;
  556. }
  557. /* The function is internally
  558. * divided into three parts according to the number of multiplications that has to be
  559. * taken place between inputA samples and inputB samples. In the first part of the
  560. * algorithm, the multiplications increase by one for every iteration.
  561. * In the second part of the algorithm, srcBLen number of multiplications are done.
  562. * In the third part of the algorithm, the multiplications decrease by one
  563. * for every iteration.*/
  564. /* The algorithm is implemented in three stages.
  565. * The loop counters of each stage is initiated here. */
  566. blockSize1 = srcBLen - 1u;
  567. blockSize2 = srcALen - (srcBLen - 1u);
  568. blockSize3 = blockSize1;
  569. /* --------------------------
  570. * Initializations of stage1
  571. * -------------------------*/
  572. /* sum = x[0] * y[srcBlen - 1]
  573. * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
  574. * ....
  575. * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
  576. */
  577. /* In this stage the MAC operations are increased by 1 for every iteration.
  578. The count variable holds the number of MAC operations performed */
  579. count = 1u;
  580. /* Working pointer of inputA */
  581. px = pIn1;
  582. /* Working pointer of inputB */
  583. pSrc1 = pIn2 + (srcBLen - 1u);
  584. py = pSrc1;
  585. /* ------------------------
  586. * Stage1 process
  587. * ----------------------*/
  588. /* The first loop starts here */
  589. while(blockSize1 > 0u)
  590. {
  591. /* Accumulator is made zero for every iteration */
  592. sum = 0;
  593. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  594. k = count >> 2;
  595. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  596. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  597. while(k > 0u)
  598. {
  599. /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
  600. sum += ((q31_t) * px++ * *py++);
  601. sum += ((q31_t) * px++ * *py++);
  602. sum += ((q31_t) * px++ * *py++);
  603. sum += ((q31_t) * px++ * *py++);
  604. /* Decrement the loop counter */
  605. k--;
  606. }
  607. /* If the count is not a multiple of 4, compute any remaining MACs here.
  608. ** No loop unrolling is used. */
  609. k = count % 0x4u;
  610. while(k > 0u)
  611. {
  612. /* Perform the multiply-accumulates */
  613. /* x[0] * y[srcBLen - 1] */
  614. sum += ((q31_t) * px++ * *py++);
  615. /* Decrement the loop counter */
  616. k--;
  617. }
  618. /* Store the result in the accumulator in the destination buffer. */
  619. *pOut = (q15_t) (sum >> 15);
  620. /* Destination pointer is updated according to the address modifier, inc */
  621. pOut += inc;
  622. /* Update the inputA and inputB pointers for next MAC calculation */
  623. py = pSrc1 - count;
  624. px = pIn1;
  625. /* Increment the MAC count */
  626. count++;
  627. /* Decrement the loop counter */
  628. blockSize1--;
  629. }
  630. /* --------------------------
  631. * Initializations of stage2
  632. * ------------------------*/
  633. /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
  634. * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
  635. * ....
  636. * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
  637. */
  638. /* Working pointer of inputA */
  639. px = pIn1;
  640. /* Working pointer of inputB */
  641. py = pIn2;
  642. /* count is index by which the pointer pIn1 to be incremented */
  643. count = 0u;
  644. /* -------------------
  645. * Stage2 process
  646. * ------------------*/
  647. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  648. * So, to loop unroll over blockSize2,
  649. * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
  650. if(srcBLen >= 4u)
  651. {
  652. /* Loop unroll over blockSize2, by 4 */
  653. blkCnt = blockSize2 >> 2u;
  654. while(blkCnt > 0u)
  655. {
  656. /* Set all accumulators to zero */
  657. acc0 = 0;
  658. acc1 = 0;
  659. acc2 = 0;
  660. acc3 = 0;
  661. /* read x[0], x[1], x[2] samples */
  662. a = *px;
  663. b = *(px + 1);
  664. #ifndef ARM_MATH_BIG_ENDIAN
  665. x0 = __PKHBT(a, b, 16);
  666. a = *(px + 2);
  667. x1 = __PKHBT(b, a, 16);
  668. #else
  669. x0 = __PKHBT(b, a, 16);
  670. a = *(px + 2);
  671. x1 = __PKHBT(a, b, 16);
  672. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  673. px += 2u;
  674. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  675. k = srcBLen >> 2u;
  676. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  677. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  678. do
  679. {
  680. /* Read the first two inputB samples using SIMD:
  681. * y[0] and y[1] */
  682. a = *py;
  683. b = *(py + 1);
  684. #ifndef ARM_MATH_BIG_ENDIAN
  685. c0 = __PKHBT(a, b, 16);
  686. #else
  687. c0 = __PKHBT(b, a, 16);
  688. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  689. /* acc0 += x[0] * y[0] + x[1] * y[1] */
  690. acc0 = __SMLAD(x0, c0, acc0);
  691. /* acc1 += x[1] * y[0] + x[2] * y[1] */
  692. acc1 = __SMLAD(x1, c0, acc1);
  693. /* Read x[2], x[3], x[4] */
  694. a = *px;
  695. b = *(px + 1);
  696. #ifndef ARM_MATH_BIG_ENDIAN
  697. x2 = __PKHBT(a, b, 16);
  698. a = *(px + 2);
  699. x3 = __PKHBT(b, a, 16);
  700. #else
  701. x2 = __PKHBT(b, a, 16);
  702. a = *(px + 2);
  703. x3 = __PKHBT(a, b, 16);
  704. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  705. /* acc2 += x[2] * y[0] + x[3] * y[1] */
  706. acc2 = __SMLAD(x2, c0, acc2);
  707. /* acc3 += x[3] * y[0] + x[4] * y[1] */
  708. acc3 = __SMLAD(x3, c0, acc3);
  709. /* Read y[2] and y[3] */
  710. a = *(py + 2);
  711. b = *(py + 3);
  712. py += 4u;
  713. #ifndef ARM_MATH_BIG_ENDIAN
  714. c0 = __PKHBT(a, b, 16);
  715. #else
  716. c0 = __PKHBT(b, a, 16);
  717. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  718. /* acc0 += x[2] * y[2] + x[3] * y[3] */
  719. acc0 = __SMLAD(x2, c0, acc0);
  720. /* acc1 += x[3] * y[2] + x[4] * y[3] */
  721. acc1 = __SMLAD(x3, c0, acc1);
  722. /* Read x[4], x[5], x[6] */
  723. a = *(px + 2);
  724. b = *(px + 3);
  725. #ifndef ARM_MATH_BIG_ENDIAN
  726. x0 = __PKHBT(a, b, 16);
  727. a = *(px + 4);
  728. x1 = __PKHBT(b, a, 16);
  729. #else
  730. x0 = __PKHBT(b, a, 16);
  731. a = *(px + 4);
  732. x1 = __PKHBT(a, b, 16);
  733. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  734. px += 4u;
  735. /* acc2 += x[4] * y[2] + x[5] * y[3] */
  736. acc2 = __SMLAD(x0, c0, acc2);
  737. /* acc3 += x[5] * y[2] + x[6] * y[3] */
  738. acc3 = __SMLAD(x1, c0, acc3);
  739. } while(--k);
  740. /* For the next MAC operations, SIMD is not used
  741. * So, the 16 bit pointer of inputB, py is updated */
  742. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  743. ** No loop unrolling is used. */
  744. k = srcBLen % 0x4u;
  745. if(k == 1u)
  746. {
  747. /* Read y[4] */
  748. c0 = *py;
  749. #ifdef ARM_MATH_BIG_ENDIAN
  750. c0 = c0 << 16u;
  751. #else
  752. c0 = c0 & 0x0000FFFF;
  753. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  754. /* Read x[7] */
  755. a = *px;
  756. b = *(px + 1);
  757. px++;;
  758. #ifndef ARM_MATH_BIG_ENDIAN
  759. x3 = __PKHBT(a, b, 16);
  760. #else
  761. x3 = __PKHBT(b, a, 16);
  762. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  763. px++;
  764. /* Perform the multiply-accumulates */
  765. acc0 = __SMLAD(x0, c0, acc0);
  766. acc1 = __SMLAD(x1, c0, acc1);
  767. acc2 = __SMLADX(x1, c0, acc2);
  768. acc3 = __SMLADX(x3, c0, acc3);
  769. }
  770. if(k == 2u)
  771. {
  772. /* Read y[4], y[5] */
  773. a = *py;
  774. b = *(py + 1);
  775. #ifndef ARM_MATH_BIG_ENDIAN
  776. c0 = __PKHBT(a, b, 16);
  777. #else
  778. c0 = __PKHBT(b, a, 16);
  779. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  780. /* Read x[7], x[8], x[9] */
  781. a = *px;
  782. b = *(px + 1);
  783. #ifndef ARM_MATH_BIG_ENDIAN
  784. x3 = __PKHBT(a, b, 16);
  785. a = *(px + 2);
  786. x2 = __PKHBT(b, a, 16);
  787. #else
  788. x3 = __PKHBT(b, a, 16);
  789. a = *(px + 2);
  790. x2 = __PKHBT(a, b, 16);
  791. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  792. px += 2u;
  793. /* Perform the multiply-accumulates */
  794. acc0 = __SMLAD(x0, c0, acc0);
  795. acc1 = __SMLAD(x1, c0, acc1);
  796. acc2 = __SMLAD(x3, c0, acc2);
  797. acc3 = __SMLAD(x2, c0, acc3);
  798. }
  799. if(k == 3u)
  800. {
  801. /* Read y[4], y[5] */
  802. a = *py;
  803. b = *(py + 1);
  804. #ifndef ARM_MATH_BIG_ENDIAN
  805. c0 = __PKHBT(a, b, 16);
  806. #else
  807. c0 = __PKHBT(b, a, 16);
  808. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  809. py += 2u;
  810. /* Read x[7], x[8], x[9] */
  811. a = *px;
  812. b = *(px + 1);
  813. #ifndef ARM_MATH_BIG_ENDIAN
  814. x3 = __PKHBT(a, b, 16);
  815. a = *(px + 2);
  816. x2 = __PKHBT(b, a, 16);
  817. #else
  818. x3 = __PKHBT(b, a, 16);
  819. a = *(px + 2);
  820. x2 = __PKHBT(a, b, 16);
  821. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  822. /* Perform the multiply-accumulates */
  823. acc0 = __SMLAD(x0, c0, acc0);
  824. acc1 = __SMLAD(x1, c0, acc1);
  825. acc2 = __SMLAD(x3, c0, acc2);
  826. acc3 = __SMLAD(x2, c0, acc3);
  827. c0 = (*py);
  828. /* Read y[6] */
  829. #ifdef ARM_MATH_BIG_ENDIAN
  830. c0 = c0 << 16u;
  831. #else
  832. c0 = c0 & 0x0000FFFF;
  833. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  834. /* Read x[10] */
  835. b = *(px + 3);
  836. #ifndef ARM_MATH_BIG_ENDIAN
  837. x3 = __PKHBT(a, b, 16);
  838. #else
  839. x3 = __PKHBT(b, a, 16);
  840. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  841. px += 3u;
  842. /* Perform the multiply-accumulates */
  843. acc0 = __SMLADX(x1, c0, acc0);
  844. acc1 = __SMLAD(x2, c0, acc1);
  845. acc2 = __SMLADX(x2, c0, acc2);
  846. acc3 = __SMLADX(x3, c0, acc3);
  847. }
  848. /* Store the result in the accumulator in the destination buffer. */
  849. *pOut = (q15_t) (acc0 >> 15);
  850. /* Destination pointer is updated according to the address modifier, inc */
  851. pOut += inc;
  852. *pOut = (q15_t) (acc1 >> 15);
  853. pOut += inc;
  854. *pOut = (q15_t) (acc2 >> 15);
  855. pOut += inc;
  856. *pOut = (q15_t) (acc3 >> 15);
  857. pOut += inc;
  858. /* Increment the pointer pIn1 index, count by 4 */
  859. count += 4u;
  860. /* Update the inputA and inputB pointers for next MAC calculation */
  861. px = pIn1 + count;
  862. py = pIn2;
  863. /* Decrement the loop counter */
  864. blkCnt--;
  865. }
  866. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
  867. ** No loop unrolling is used. */
  868. blkCnt = blockSize2 % 0x4u;
  869. while(blkCnt > 0u)
  870. {
  871. /* Accumulator is made zero for every iteration */
  872. sum = 0;
  873. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  874. k = srcBLen >> 2u;
  875. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  876. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  877. while(k > 0u)
  878. {
  879. /* Perform the multiply-accumulates */
  880. sum += ((q31_t) * px++ * *py++);
  881. sum += ((q31_t) * px++ * *py++);
  882. sum += ((q31_t) * px++ * *py++);
  883. sum += ((q31_t) * px++ * *py++);
  884. /* Decrement the loop counter */
  885. k--;
  886. }
  887. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  888. ** No loop unrolling is used. */
  889. k = srcBLen % 0x4u;
  890. while(k > 0u)
  891. {
  892. /* Perform the multiply-accumulates */
  893. sum += ((q31_t) * px++ * *py++);
  894. /* Decrement the loop counter */
  895. k--;
  896. }
  897. /* Store the result in the accumulator in the destination buffer. */
  898. *pOut = (q15_t) (sum >> 15);
  899. /* Destination pointer is updated according to the address modifier, inc */
  900. pOut += inc;
  901. /* Increment the pointer pIn1 index, count by 1 */
  902. count++;
  903. /* Update the inputA and inputB pointers for next MAC calculation */
  904. px = pIn1 + count;
  905. py = pIn2;
  906. /* Decrement the loop counter */
  907. blkCnt--;
  908. }
  909. }
  910. else
  911. {
  912. /* If the srcBLen is less than 4,
  913. * the blockSize2 loop cannot be unrolled by 4 */
  914. blkCnt = blockSize2;
  915. while(blkCnt > 0u)
  916. {
  917. /* Accumulator is made zero for every iteration */
  918. sum = 0;
  919. /* Loop over srcBLen */
  920. k = srcBLen;
  921. while(k > 0u)
  922. {
  923. /* Perform the multiply-accumulate */
  924. sum += ((q31_t) * px++ * *py++);
  925. /* Decrement the loop counter */
  926. k--;
  927. }
  928. /* Store the result in the accumulator in the destination buffer. */
  929. *pOut = (q15_t) (sum >> 15);
  930. /* Destination pointer is updated according to the address modifier, inc */
  931. pOut += inc;
  932. /* Increment the MAC count */
  933. count++;
  934. /* Update the inputA and inputB pointers for next MAC calculation */
  935. px = pIn1 + count;
  936. py = pIn2;
  937. /* Decrement the loop counter */
  938. blkCnt--;
  939. }
  940. }
  941. /* --------------------------
  942. * Initializations of stage3
  943. * -------------------------*/
  944. /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
  945. * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
  946. * ....
  947. * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
  948. * sum += x[srcALen-1] * y[0]
  949. */
  950. /* In this stage the MAC operations are decreased by 1 for every iteration.
  951. The count variable holds the number of MAC operations performed */
  952. count = srcBLen - 1u;
  953. /* Working pointer of inputA */
  954. pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
  955. px = pSrc1;
  956. /* Working pointer of inputB */
  957. py = pIn2;
  958. /* -------------------
  959. * Stage3 process
  960. * ------------------*/
  961. while(blockSize3 > 0u)
  962. {
  963. /* Accumulator is made zero for every iteration */
  964. sum = 0;
  965. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  966. k = count >> 2u;
  967. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  968. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  969. while(k > 0u)
  970. {
  971. /* Perform the multiply-accumulates */
  972. sum += ((q31_t) * px++ * *py++);
  973. sum += ((q31_t) * px++ * *py++);
  974. sum += ((q31_t) * px++ * *py++);
  975. sum += ((q31_t) * px++ * *py++);
  976. /* Decrement the loop counter */
  977. k--;
  978. }
  979. /* If the count is not a multiple of 4, compute any remaining MACs here.
  980. ** No loop unrolling is used. */
  981. k = count % 0x4u;
  982. while(k > 0u)
  983. {
  984. /* Perform the multiply-accumulates */
  985. sum += ((q31_t) * px++ * *py++);
  986. /* Decrement the loop counter */
  987. k--;
  988. }
  989. /* Store the result in the accumulator in the destination buffer. */
  990. *pOut = (q15_t) (sum >> 15);
  991. /* Destination pointer is updated according to the address modifier, inc */
  992. pOut += inc;
  993. /* Update the inputA and inputB pointers for next MAC calculation */
  994. px = ++pSrc1;
  995. py = pIn2;
  996. /* Decrement the MAC count */
  997. count--;
  998. /* Decrement the loop counter */
  999. blockSize3--;
  1000. }
  1001. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  1002. }
  1003. /**
  1004. * @} end of Corr group
  1005. */