Keyboard firmwares for Atmel AVR and Cortex-M
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

arm_conv_fast_q15.c 38KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_conv_fast_q15.c
  9. *
  10. * Description: Fast Q15 Convolution.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup Conv
  46. * @{
  47. */
  48. /**
  49. * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  50. * @param[in] *pSrcA points to the first input sequence.
  51. * @param[in] srcALen length of the first input sequence.
  52. * @param[in] *pSrcB points to the second input sequence.
  53. * @param[in] srcBLen length of the second input sequence.
  54. * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
  55. * @return none.
  56. *
  57. * <b>Scaling and Overflow Behavior:</b>
  58. *
  59. * \par
  60. * This fast version uses a 32-bit accumulator with 2.30 format.
  61. * The accumulator maintains full precision of the intermediate multiplication results
  62. * but provides only a single guard bit. There is no saturation on intermediate additions.
  63. * Thus, if the accumulator overflows it wraps around and distorts the result.
  64. * The input signals should be scaled down to avoid intermediate overflows.
  65. * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the base-2 logarithm) to avoid overflows,
  66. * because at most min(srcALen, srcBLen) additions are carried out internally for each output sample.
  67. * The 2.30 accumulator is right shifted by 15 bits and then truncated to 16 bits (1.15 format, no saturation) to yield the final result.
  68. *
  69. * \par
  70. * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
  71. */
  72. void arm_conv_fast_q15(
  73. q15_t * pSrcA,
  74. uint32_t srcALen,
  75. q15_t * pSrcB,
  76. uint32_t srcBLen,
  77. q15_t * pDst)
  78. {
  79. #ifndef UNALIGNED_SUPPORT_DISABLE
  80. q15_t *pIn1; /* inputA pointer */
  81. q15_t *pIn2; /* inputB pointer */
  82. q15_t *pOut = pDst; /* output pointer */
  83. q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
  84. q15_t *px; /* Intermediate inputA pointer */
  85. q15_t *py; /* Intermediate inputB pointer */
  86. q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
  87. q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
  88. uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
  89. /* The algorithm implementation is based on the lengths of the inputs. */
  90. /* srcB is always made to slide across srcA. */
  91. /* So srcBLen is always considered as shorter or equal to srcALen */
  92. if(srcALen >= srcBLen)
  93. {
  94. /* Initialization of inputA pointer */
  95. pIn1 = pSrcA;
  96. /* Initialization of inputB pointer */
  97. pIn2 = pSrcB;
  98. }
  99. else
  100. {
  101. /* Initialization of inputA pointer */
  102. pIn1 = pSrcB;
  103. /* Initialization of inputB pointer */
  104. pIn2 = pSrcA;
  105. /* srcBLen is always considered as shorter or equal to srcALen */
  106. j = srcBLen;
  107. srcBLen = srcALen;
  108. srcALen = j;
  109. }
  110. /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
  111. /* The function is internally
  112. * divided into three stages according to the number of multiplications that have to
  113. * take place between inputA samples and inputB samples. In the first stage of the
  114. * algorithm, the multiplications increase by one for every iteration.
  115. * In the second stage of the algorithm, srcBLen number of multiplications are done.
  116. * In the third stage of the algorithm, the multiplications decrease by one
  117. * for every iteration. */
  118. /* The algorithm is implemented in three stages.
  119. The loop counters of each stage are initialized here. */
  120. blockSize1 = srcBLen - 1u;
  121. blockSize2 = srcALen - (srcBLen - 1u);
  122. blockSize3 = blockSize1;
  123. /* --------------------------
  124. * Initializations of stage1
  125. * -------------------------*/
  126. /* sum = x[0] * y[0]
  127. * sum = x[0] * y[1] + x[1] * y[0]
  128. * ....
  129. * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
  130. */
  131. /* In this stage the MAC operations are increased by 1 for every iteration.
  132. The count variable holds the number of MAC operations performed */
  133. count = 1u;
  134. /* Working pointer of inputA */
  135. px = pIn1;
  136. /* Working pointer of inputB */
  137. py = pIn2;
  138. /* ------------------------
  139. * Stage1 process
  140. * ----------------------*/
  141. /* For loop unrolling by 4, this stage is divided into two. */
  142. /* First part of this stage computes the MAC operations less than 4 */
  143. /* Second part of this stage computes the MAC operations greater than or equal to 4 */
  144. /* The first part of the stage starts here */
  145. while((count < 4u) && (blockSize1 > 0u))
  146. {
  147. /* Accumulator is made zero for every iteration */
  148. sum = 0;
  149. /* Loop over number of MAC operations between
  150. * inputA samples and inputB samples */
  151. k = count;
  152. while(k > 0u)
  153. {
  154. /* Perform the multiply-accumulates */
  155. sum = __SMLAD(*px++, *py--, sum);
  156. /* Decrement the loop counter */
  157. k--;
  158. }
  159. /* Store the result in the accumulator in the destination buffer. */
  160. *pOut++ = (q15_t) (sum >> 15);
  161. /* Update the inputA and inputB pointers for next MAC calculation */
  162. py = pIn2 + count;
  163. px = pIn1;
  164. /* Increment the MAC count */
  165. count++;
  166. /* Decrement the loop counter */
  167. blockSize1--;
  168. }
  169. /* The second part of the stage starts here */
  170. /* The internal loop, over count, is unrolled by 4 */
  171. /* To read two adjacent inputB samples with one SIMD access
  172. * (y[count-2] and y[count-1]), py is decremented by 1 */
  173. py = py - 1;
  174. while(blockSize1 > 0u)
  175. {
  176. /* Accumulator is made zero for every iteration */
  177. sum = 0;
  178. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  179. k = count >> 2u;
  180. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  181. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  182. while(k > 0u)
  183. {
  184. /* Perform the multiply-accumulates */
  185. /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
  186. sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
  187. /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
  188. sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
  189. /* Decrement the loop counter */
  190. k--;
  191. }
  192. /* For the next MAC operations, the pointer py is used without SIMD
  193. * So, py is incremented by 1 */
  194. py = py + 1u;
  195. /* If the count is not a multiple of 4, compute any remaining MACs here.
  196. ** No loop unrolling is used. */
  197. k = count % 0x4u;
  198. while(k > 0u)
  199. {
  200. /* Perform the multiply-accumulates */
  201. sum = __SMLAD(*px++, *py--, sum);
  202. /* Decrement the loop counter */
  203. k--;
  204. }
  205. /* Store the result in the accumulator in the destination buffer. */
  206. *pOut++ = (q15_t) (sum >> 15);
  207. /* Update the inputA and inputB pointers for next MAC calculation */
  208. py = pIn2 + (count - 1u);
  209. px = pIn1;
  210. /* Increment the MAC count */
  211. count++;
  212. /* Decrement the loop counter */
  213. blockSize1--;
  214. }
  215. /* --------------------------
  216. * Initializations of stage2
  217. * ------------------------*/
  218. /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
  219. * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
  220. * ....
  221. * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
  222. */
  223. /* Working pointer of inputA */
  224. px = pIn1;
  225. /* Working pointer of inputB */
  226. pSrc2 = pIn2 + (srcBLen - 1u);
  227. py = pSrc2;
  228. /* count is the index by which the pointer pIn1 to be incremented */
  229. count = 0u;
  230. /* --------------------
  231. * Stage2 process
  232. * -------------------*/
  233. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  234. * So, to loop unroll over blockSize2,
  235. * srcBLen should be greater than or equal to 4 */
  236. if(srcBLen >= 4u)
  237. {
  238. /* Loop unroll over blockSize2, by 4 */
  239. blkCnt = blockSize2 >> 2u;
  240. while(blkCnt > 0u)
  241. {
  242. py = py - 1u;
  243. /* Set all accumulators to zero */
  244. acc0 = 0;
  245. acc1 = 0;
  246. acc2 = 0;
  247. acc3 = 0;
  248. /* read x[0], x[1] samples */
  249. x0 = *__SIMD32(px);
  250. /* read x[1], x[2] samples */
  251. x1 = _SIMD32_OFFSET(px+1);
  252. px+= 2u;
  253. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  254. k = srcBLen >> 2u;
  255. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  256. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  257. do
  258. {
  259. /* Read the last two inputB samples using SIMD:
  260. * y[srcBLen - 1] and y[srcBLen - 2] */
  261. c0 = *__SIMD32(py)--;
  262. /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
  263. acc0 = __SMLADX(x0, c0, acc0);
  264. /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
  265. acc1 = __SMLADX(x1, c0, acc1);
  266. /* Read x[2], x[3] */
  267. x2 = *__SIMD32(px);
  268. /* Read x[3], x[4] */
  269. x3 = _SIMD32_OFFSET(px+1);
  270. /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
  271. acc2 = __SMLADX(x2, c0, acc2);
  272. /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
  273. acc3 = __SMLADX(x3, c0, acc3);
  274. /* Read y[srcBLen - 3] and y[srcBLen - 4] */
  275. c0 = *__SIMD32(py)--;
  276. /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
  277. acc0 = __SMLADX(x2, c0, acc0);
  278. /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
  279. acc1 = __SMLADX(x3, c0, acc1);
  280. /* Read x[4], x[5] */
  281. x0 = _SIMD32_OFFSET(px+2);
  282. /* Read x[5], x[6] */
  283. x1 = _SIMD32_OFFSET(px+3);
  284. px += 4u;
  285. /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
  286. acc2 = __SMLADX(x0, c0, acc2);
  287. /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
  288. acc3 = __SMLADX(x1, c0, acc3);
  289. } while(--k);
  290. /* For the next MAC operations, SIMD is not used
  291. * So, the 16 bit pointer if inputB, py is updated */
  292. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  293. ** No loop unrolling is used. */
  294. k = srcBLen % 0x4u;
  295. if(k == 1u)
  296. {
  297. /* Read y[srcBLen - 5] */
  298. c0 = *(py+1);
  299. #ifdef ARM_MATH_BIG_ENDIAN
  300. c0 = c0 << 16u;
  301. #else
  302. c0 = c0 & 0x0000FFFF;
  303. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  304. /* Read x[7] */
  305. x3 = *__SIMD32(px);
  306. px++;
  307. /* Perform the multiply-accumulates */
  308. acc0 = __SMLAD(x0, c0, acc0);
  309. acc1 = __SMLAD(x1, c0, acc1);
  310. acc2 = __SMLADX(x1, c0, acc2);
  311. acc3 = __SMLADX(x3, c0, acc3);
  312. }
  313. if(k == 2u)
  314. {
  315. /* Read y[srcBLen - 5], y[srcBLen - 6] */
  316. c0 = _SIMD32_OFFSET(py);
  317. /* Read x[7], x[8] */
  318. x3 = *__SIMD32(px);
  319. /* Read x[9] */
  320. x2 = _SIMD32_OFFSET(px+1);
  321. px += 2u;
  322. /* Perform the multiply-accumulates */
  323. acc0 = __SMLADX(x0, c0, acc0);
  324. acc1 = __SMLADX(x1, c0, acc1);
  325. acc2 = __SMLADX(x3, c0, acc2);
  326. acc3 = __SMLADX(x2, c0, acc3);
  327. }
  328. if(k == 3u)
  329. {
  330. /* Read y[srcBLen - 5], y[srcBLen - 6] */
  331. c0 = _SIMD32_OFFSET(py);
  332. /* Read x[7], x[8] */
  333. x3 = *__SIMD32(px);
  334. /* Read x[9] */
  335. x2 = _SIMD32_OFFSET(px+1);
  336. /* Perform the multiply-accumulates */
  337. acc0 = __SMLADX(x0, c0, acc0);
  338. acc1 = __SMLADX(x1, c0, acc1);
  339. acc2 = __SMLADX(x3, c0, acc2);
  340. acc3 = __SMLADX(x2, c0, acc3);
  341. /* Read y[srcBLen - 7] */
  342. c0 = *(py-1);
  343. #ifdef ARM_MATH_BIG_ENDIAN
  344. c0 = c0 << 16u;
  345. #else
  346. c0 = c0 & 0x0000FFFF;
  347. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  348. /* Read x[10] */
  349. x3 = _SIMD32_OFFSET(px+2);
  350. px += 3u;
  351. /* Perform the multiply-accumulates */
  352. acc0 = __SMLADX(x1, c0, acc0);
  353. acc1 = __SMLAD(x2, c0, acc1);
  354. acc2 = __SMLADX(x2, c0, acc2);
  355. acc3 = __SMLADX(x3, c0, acc3);
  356. }
  357. /* Store the results in the accumulators in the destination buffer. */
  358. #ifndef ARM_MATH_BIG_ENDIAN
  359. *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
  360. *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
  361. #else
  362. *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
  363. *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
  364. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  365. /* Increment the pointer pIn1 index, count by 4 */
  366. count += 4u;
  367. /* Update the inputA and inputB pointers for next MAC calculation */
  368. px = pIn1 + count;
  369. py = pSrc2;
  370. /* Decrement the loop counter */
  371. blkCnt--;
  372. }
  373. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
  374. ** No loop unrolling is used. */
  375. blkCnt = blockSize2 % 0x4u;
  376. while(blkCnt > 0u)
  377. {
  378. /* Accumulator is made zero for every iteration */
  379. sum = 0;
  380. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  381. k = srcBLen >> 2u;
  382. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  383. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  384. while(k > 0u)
  385. {
  386. /* Perform the multiply-accumulates */
  387. sum += ((q31_t) * px++ * *py--);
  388. sum += ((q31_t) * px++ * *py--);
  389. sum += ((q31_t) * px++ * *py--);
  390. sum += ((q31_t) * px++ * *py--);
  391. /* Decrement the loop counter */
  392. k--;
  393. }
  394. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  395. ** No loop unrolling is used. */
  396. k = srcBLen % 0x4u;
  397. while(k > 0u)
  398. {
  399. /* Perform the multiply-accumulates */
  400. sum += ((q31_t) * px++ * *py--);
  401. /* Decrement the loop counter */
  402. k--;
  403. }
  404. /* Store the result in the accumulator in the destination buffer. */
  405. *pOut++ = (q15_t) (sum >> 15);
  406. /* Increment the pointer pIn1 index, count by 1 */
  407. count++;
  408. /* Update the inputA and inputB pointers for next MAC calculation */
  409. px = pIn1 + count;
  410. py = pSrc2;
  411. /* Decrement the loop counter */
  412. blkCnt--;
  413. }
  414. }
  415. else
  416. {
  417. /* If srcBLen is less than 4,
  418. * the blockSize2 loop is not unrolled by 4 */
  419. blkCnt = blockSize2;
  420. while(blkCnt > 0u)
  421. {
  422. /* Accumulator is made zero for every iteration */
  423. sum = 0;
  424. /* srcBLen number of MACS should be performed */
  425. k = srcBLen;
  426. while(k > 0u)
  427. {
  428. /* Perform the multiply-accumulate */
  429. sum += ((q31_t) * px++ * *py--);
  430. /* Decrement the loop counter */
  431. k--;
  432. }
  433. /* Store the result in the accumulator in the destination buffer. */
  434. *pOut++ = (q15_t) (sum >> 15);
  435. /* Increment the MAC count */
  436. count++;
  437. /* Update the inputA and inputB pointers for next MAC calculation */
  438. px = pIn1 + count;
  439. py = pSrc2;
  440. /* Decrement the loop counter */
  441. blkCnt--;
  442. }
  443. }
  444. /* --------------------------
  445. * Initializations of stage3
  446. * -------------------------*/
  447. /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
  448. * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
  449. * ....
  450. * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
  451. * sum += x[srcALen-1] * y[srcBLen-1]
  452. */
  453. /* In this stage the MAC operations are decreased by 1 for every iteration.
  454. The blockSize3 variable holds the number of MAC operations performed */
  455. /* Working pointer of inputA */
  456. pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
  457. px = pSrc1;
  458. /* Working pointer of inputB */
  459. pSrc2 = pIn2 + (srcBLen - 1u);
  460. pIn2 = pSrc2 - 1u;
  461. py = pIn2;
  462. /* -------------------
  463. * Stage3 process
  464. * ------------------*/
  465. /* For loop unrolling by 4, this stage is divided into two. */
  466. /* First part of this stage computes the MAC operations greater than 4 */
  467. /* Second part of this stage computes the MAC operations less than or equal to 4 */
  468. /* The first part of the stage starts here */
  469. j = blockSize3 >> 2u;
  470. while((j > 0u) && (blockSize3 > 0u))
  471. {
  472. /* Accumulator is made zero for every iteration */
  473. sum = 0;
  474. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  475. k = blockSize3 >> 2u;
  476. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  477. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  478. while(k > 0u)
  479. {
  480. /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
  481. * with y[srcBLen - 1], y[srcBLen - 2] respectively */
  482. sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
  483. /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
  484. * with y[srcBLen - 3], y[srcBLen - 4] respectively */
  485. sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
  486. /* Decrement the loop counter */
  487. k--;
  488. }
  489. /* For the next MAC operations, the pointer py is used without SIMD
  490. * So, py is incremented by 1 */
  491. py = py + 1u;
  492. /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
  493. ** No loop unrolling is used. */
  494. k = blockSize3 % 0x4u;
  495. while(k > 0u)
  496. {
  497. /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
  498. sum = __SMLAD(*px++, *py--, sum);
  499. /* Decrement the loop counter */
  500. k--;
  501. }
  502. /* Store the result in the accumulator in the destination buffer. */
  503. *pOut++ = (q15_t) (sum >> 15);
  504. /* Update the inputA and inputB pointers for next MAC calculation */
  505. px = ++pSrc1;
  506. py = pIn2;
  507. /* Decrement the loop counter */
  508. blockSize3--;
  509. j--;
  510. }
  511. /* The second part of the stage starts here */
  512. /* SIMD is not used for the next MAC operations,
  513. * so pointer py is updated to read only one sample at a time */
  514. py = py + 1u;
  515. while(blockSize3 > 0u)
  516. {
  517. /* Accumulator is made zero for every iteration */
  518. sum = 0;
  519. /* No loop unrolling here: perform the remaining blockSize3 MACs one at a time. */
  520. k = blockSize3;
  521. while(k > 0u)
  522. {
  523. /* Perform the multiply-accumulates */
  524. /* sum += x[srcALen-1] * y[srcBLen-1] */
  525. sum = __SMLAD(*px++, *py--, sum);
  526. /* Decrement the loop counter */
  527. k--;
  528. }
  529. /* Store the result in the accumulator in the destination buffer. */
  530. *pOut++ = (q15_t) (sum >> 15);
  531. /* Update the inputA and inputB pointers for next MAC calculation */
  532. px = ++pSrc1;
  533. py = pSrc2;
  534. /* Decrement the loop counter */
  535. blockSize3--;
  536. }
  537. #else
  538. q15_t *pIn1; /* inputA pointer */
  539. q15_t *pIn2; /* inputB pointer */
  540. q15_t *pOut = pDst; /* output pointer */
  541. q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
  542. q15_t *px; /* Intermediate inputA pointer */
  543. q15_t *py; /* Intermediate inputB pointer */
  544. q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
  545. q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
  546. uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
  547. q15_t a, b;
  548. /* The algorithm implementation is based on the lengths of the inputs. */
  549. /* srcB is always made to slide across srcA. */
  550. /* So srcBLen is always considered as shorter or equal to srcALen */
  551. if(srcALen >= srcBLen)
  552. {
  553. /* Initialization of inputA pointer */
  554. pIn1 = pSrcA;
  555. /* Initialization of inputB pointer */
  556. pIn2 = pSrcB;
  557. }
  558. else
  559. {
  560. /* Initialization of inputA pointer */
  561. pIn1 = pSrcB;
  562. /* Initialization of inputB pointer */
  563. pIn2 = pSrcA;
  564. /* srcBLen is always considered as shorter or equal to srcALen */
  565. j = srcBLen;
  566. srcBLen = srcALen;
  567. srcALen = j;
  568. }
  569. /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
  570. /* The function is internally
  571. * divided into three stages according to the number of multiplications that has to be
  572. * taken place between inputA samples and inputB samples. In the first stage of the
  573. * algorithm, the multiplications increase by one for every iteration.
  574. * In the second stage of the algorithm, srcBLen number of multiplications are done.
  575. * In the third stage of the algorithm, the multiplications decrease by one
  576. * for every iteration. */
  577. /* The algorithm is implemented in three stages.
  578. The loop counters of each stage is initiated here. */
  579. blockSize1 = srcBLen - 1u;
  580. blockSize2 = srcALen - (srcBLen - 1u);
  581. blockSize3 = blockSize1;
  582. /* --------------------------
  583. * Initializations of stage1
  584. * -------------------------*/
  585. /* sum = x[0] * y[0]
  586. * sum = x[0] * y[1] + x[1] * y[0]
  587. * ....
  588. * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
  589. */
  590. /* In this stage the MAC operations are increased by 1 for every iteration.
  591. The count variable holds the number of MAC operations performed */
  592. count = 1u;
  593. /* Working pointer of inputA */
  594. px = pIn1;
  595. /* Working pointer of inputB */
  596. py = pIn2;
  597. /* ------------------------
  598. * Stage1 process
  599. * ----------------------*/
  600. /* For loop unrolling by 4, this stage is divided into two. */
  601. /* First part of this stage computes the MAC operations less than 4 */
  602. /* Second part of this stage computes the MAC operations greater than or equal to 4 */
  603. /* The first part of the stage starts here */
  604. while((count < 4u) && (blockSize1 > 0u))
  605. {
  606. /* Accumulator is made zero for every iteration */
  607. sum = 0;
  608. /* Loop over number of MAC operations between
  609. * inputA samples and inputB samples */
  610. k = count;
  611. while(k > 0u)
  612. {
  613. /* Perform the multiply-accumulates */
  614. sum += ((q31_t) * px++ * *py--);
  615. /* Decrement the loop counter */
  616. k--;
  617. }
  618. /* Store the result in the accumulator in the destination buffer. */
  619. *pOut++ = (q15_t) (sum >> 15);
  620. /* Update the inputA and inputB pointers for next MAC calculation */
  621. py = pIn2 + count;
  622. px = pIn1;
  623. /* Increment the MAC count */
  624. count++;
  625. /* Decrement the loop counter */
  626. blockSize1--;
  627. }
  628. /* The second part of the stage starts here */
  629. /* The internal loop, over count, is unrolled by 4 */
  630. /* To, read the last two inputB samples using SIMD:
  631. * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
  632. py = py - 1;
  633. while(blockSize1 > 0u)
  634. {
  635. /* Accumulator is made zero for every iteration */
  636. sum = 0;
  637. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  638. k = count >> 2u;
  639. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  640. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  641. py++;
  642. while(k > 0u)
  643. {
  644. /* Perform the multiply-accumulates */
  645. sum += ((q31_t) * px++ * *py--);
  646. sum += ((q31_t) * px++ * *py--);
  647. sum += ((q31_t) * px++ * *py--);
  648. sum += ((q31_t) * px++ * *py--);
  649. /* Decrement the loop counter */
  650. k--;
  651. }
  652. /* If the count is not a multiple of 4, compute any remaining MACs here.
  653. ** No loop unrolling is used. */
  654. k = count % 0x4u;
  655. while(k > 0u)
  656. {
  657. /* Perform the multiply-accumulates */
  658. sum += ((q31_t) * px++ * *py--);
  659. /* Decrement the loop counter */
  660. k--;
  661. }
  662. /* Store the result in the accumulator in the destination buffer. */
  663. *pOut++ = (q15_t) (sum >> 15);
  664. /* Update the inputA and inputB pointers for next MAC calculation */
  665. py = pIn2 + (count - 1u);
  666. px = pIn1;
  667. /* Increment the MAC count */
  668. count++;
  669. /* Decrement the loop counter */
  670. blockSize1--;
  671. }
  672. /* --------------------------
  673. * Initializations of stage2
  674. * ------------------------*/
  675. /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
  676. * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
  677. * ....
  678. * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
  679. */
  680. /* Working pointer of inputA */
  681. px = pIn1;
  682. /* Working pointer of inputB */
  683. pSrc2 = pIn2 + (srcBLen - 1u);
  684. py = pSrc2;
  685. /* count is the index by which the pointer pIn1 to be incremented */
  686. count = 0u;
  687. /* --------------------
  688. * Stage2 process
  689. * -------------------*/
  690. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  691. * So, to loop unroll over blockSize2,
  692. * srcBLen should be greater than or equal to 4 */
  693. if(srcBLen >= 4u)
  694. {
  695. /* Loop unroll over blockSize2, by 4 */
  696. blkCnt = blockSize2 >> 2u;
  697. while(blkCnt > 0u)
  698. {
  699. py = py - 1u;
  700. /* Set all accumulators to zero */
  701. acc0 = 0;
  702. acc1 = 0;
  703. acc2 = 0;
  704. acc3 = 0;
  705. /* read x[0], x[1] samples */
  706. a = *px++;
  707. b = *px++;
  708. #ifndef ARM_MATH_BIG_ENDIAN
  709. x0 = __PKHBT(a, b, 16);
  710. a = *px;
  711. x1 = __PKHBT(b, a, 16);
  712. #else
  713. x0 = __PKHBT(b, a, 16);
  714. a = *px;
  715. x1 = __PKHBT(a, b, 16);
  716. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  717. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  718. k = srcBLen >> 2u;
  719. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  720. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  721. do
  722. {
  723. /* Read the last two inputB samples using SIMD:
  724. * y[srcBLen - 1] and y[srcBLen - 2] */
  725. a = *py;
  726. b = *(py+1);
  727. py -= 2;
  728. #ifndef ARM_MATH_BIG_ENDIAN
  729. c0 = __PKHBT(a, b, 16);
  730. #else
  731. c0 = __PKHBT(b, a, 16);;
  732. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  733. /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
  734. acc0 = __SMLADX(x0, c0, acc0);
  735. /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
  736. acc1 = __SMLADX(x1, c0, acc1);
  737. a = *px;
  738. b = *(px + 1);
  739. #ifndef ARM_MATH_BIG_ENDIAN
  740. x2 = __PKHBT(a, b, 16);
  741. a = *(px + 2);
  742. x3 = __PKHBT(b, a, 16);
  743. #else
  744. x2 = __PKHBT(b, a, 16);
  745. a = *(px + 2);
  746. x3 = __PKHBT(a, b, 16);
  747. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  748. /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
  749. acc2 = __SMLADX(x2, c0, acc2);
  750. /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
  751. acc3 = __SMLADX(x3, c0, acc3);
  752. /* Read y[srcBLen - 3] and y[srcBLen - 4] */
  753. a = *py;
  754. b = *(py+1);
  755. py -= 2;
  756. #ifndef ARM_MATH_BIG_ENDIAN
  757. c0 = __PKHBT(a, b, 16);
  758. #else
  759. c0 = __PKHBT(b, a, 16);;
  760. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  761. /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
  762. acc0 = __SMLADX(x2, c0, acc0);
  763. /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
  764. acc1 = __SMLADX(x3, c0, acc1);
  765. /* Read x[4], x[5], x[6] */
  766. a = *(px + 2);
  767. b = *(px + 3);
  768. #ifndef ARM_MATH_BIG_ENDIAN
  769. x0 = __PKHBT(a, b, 16);
  770. a = *(px + 4);
  771. x1 = __PKHBT(b, a, 16);
  772. #else
  773. x0 = __PKHBT(b, a, 16);
  774. a = *(px + 4);
  775. x1 = __PKHBT(a, b, 16);
  776. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  777. px += 4u;
  778. /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
  779. acc2 = __SMLADX(x0, c0, acc2);
  780. /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
  781. acc3 = __SMLADX(x1, c0, acc3);
  782. } while(--k);
  783. /* For the next MAC operations, SIMD is not used,
  784. * so the 16-bit pointer of inputB, py, is updated */
  785. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  786. ** No loop unrolling is used. */
  787. k = srcBLen % 0x4u;
  788. if(k == 1u)
  789. {
  790. /* Read y[srcBLen - 5] */
  791. c0 = *(py+1);
  792. #ifdef ARM_MATH_BIG_ENDIAN
  793. c0 = c0 << 16u;
  794. #else
  795. c0 = c0 & 0x0000FFFF;
  796. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  797. /* Read x[7] */
  798. a = *px;
  799. b = *(px+1);
  800. px++;
  801. #ifndef ARM_MATH_BIG_ENDIAN
  802. x3 = __PKHBT(a, b, 16);
  803. #else
  804. x3 = __PKHBT(b, a, 16);;
  805. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  806. /* Perform the multiply-accumulates */
  807. acc0 = __SMLAD(x0, c0, acc0);
  808. acc1 = __SMLAD(x1, c0, acc1);
  809. acc2 = __SMLADX(x1, c0, acc2);
  810. acc3 = __SMLADX(x3, c0, acc3);
  811. }
  812. if(k == 2u)
  813. {
  814. /* Read y[srcBLen - 5], y[srcBLen - 6] */
  815. a = *py;
  816. b = *(py+1);
  817. #ifndef ARM_MATH_BIG_ENDIAN
  818. c0 = __PKHBT(a, b, 16);
  819. #else
  820. c0 = __PKHBT(b, a, 16);;
  821. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  822. /* Read x[7], x[8], x[9] */
  823. a = *px;
  824. b = *(px + 1);
  825. #ifndef ARM_MATH_BIG_ENDIAN
  826. x3 = __PKHBT(a, b, 16);
  827. a = *(px + 2);
  828. x2 = __PKHBT(b, a, 16);
  829. #else
  830. x3 = __PKHBT(b, a, 16);
  831. a = *(px + 2);
  832. x2 = __PKHBT(a, b, 16);
  833. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  834. px += 2u;
  835. /* Perform the multiply-accumulates */
  836. acc0 = __SMLADX(x0, c0, acc0);
  837. acc1 = __SMLADX(x1, c0, acc1);
  838. acc2 = __SMLADX(x3, c0, acc2);
  839. acc3 = __SMLADX(x2, c0, acc3);
  840. }
  841. if(k == 3u)
  842. {
  843. /* Read y[srcBLen - 5], y[srcBLen - 6] */
  844. a = *py;
  845. b = *(py+1);
  846. #ifndef ARM_MATH_BIG_ENDIAN
  847. c0 = __PKHBT(a, b, 16);
  848. #else
  849. c0 = __PKHBT(b, a, 16);;
  850. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  851. /* Read x[7], x[8], x[9] */
  852. a = *px;
  853. b = *(px + 1);
  854. #ifndef ARM_MATH_BIG_ENDIAN
  855. x3 = __PKHBT(a, b, 16);
  856. a = *(px + 2);
  857. x2 = __PKHBT(b, a, 16);
  858. #else
  859. x3 = __PKHBT(b, a, 16);
  860. a = *(px + 2);
  861. x2 = __PKHBT(a, b, 16);
  862. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  863. /* Perform the multiply-accumulates */
  864. acc0 = __SMLADX(x0, c0, acc0);
  865. acc1 = __SMLADX(x1, c0, acc1);
  866. acc2 = __SMLADX(x3, c0, acc2);
  867. acc3 = __SMLADX(x2, c0, acc3);
  868. /* Read y[srcBLen - 7] */
  869. c0 = *(py-1);
  870. #ifdef ARM_MATH_BIG_ENDIAN
  871. c0 = c0 << 16u;
  872. #else
  873. c0 = c0 & 0x0000FFFF;
  874. #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
  875. /* Read x[10] */
  876. a = *(px+2);
  877. b = *(px+3);
  878. #ifndef ARM_MATH_BIG_ENDIAN
  879. x3 = __PKHBT(a, b, 16);
  880. #else
  881. x3 = __PKHBT(b, a, 16);;
  882. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  883. px += 3u;
  884. /* Perform the multiply-accumulates */
  885. acc0 = __SMLADX(x1, c0, acc0);
  886. acc1 = __SMLAD(x2, c0, acc1);
  887. acc2 = __SMLADX(x2, c0, acc2);
  888. acc3 = __SMLADX(x3, c0, acc3);
  889. }
  890. /* Store the results in the accumulators in the destination buffer. */
  891. *pOut++ = (q15_t)(acc0 >> 15);
  892. *pOut++ = (q15_t)(acc1 >> 15);
  893. *pOut++ = (q15_t)(acc2 >> 15);
  894. *pOut++ = (q15_t)(acc3 >> 15);
  895. /* Increment the pointer pIn1 index, count by 4 */
  896. count += 4u;
  897. /* Update the inputA and inputB pointers for next MAC calculation */
  898. px = pIn1 + count;
  899. py = pSrc2;
  900. /* Decrement the loop counter */
  901. blkCnt--;
  902. }
  903. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
  904. ** No loop unrolling is used. */
  905. blkCnt = blockSize2 % 0x4u;
  906. while(blkCnt > 0u)
  907. {
  908. /* Accumulator is made zero for every iteration */
  909. sum = 0;
  910. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  911. k = srcBLen >> 2u;
  912. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  913. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  914. while(k > 0u)
  915. {
  916. /* Perform the multiply-accumulates */
  917. sum += ((q31_t) * px++ * *py--);
  918. sum += ((q31_t) * px++ * *py--);
  919. sum += ((q31_t) * px++ * *py--);
  920. sum += ((q31_t) * px++ * *py--);
  921. /* Decrement the loop counter */
  922. k--;
  923. }
  924. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  925. ** No loop unrolling is used. */
  926. k = srcBLen % 0x4u;
  927. while(k > 0u)
  928. {
  929. /* Perform the multiply-accumulates */
  930. sum += ((q31_t) * px++ * *py--);
  931. /* Decrement the loop counter */
  932. k--;
  933. }
  934. /* Store the result in the accumulator in the destination buffer. */
  935. *pOut++ = (q15_t) (sum >> 15);
  936. /* Increment the pointer pIn1 index, count by 1 */
  937. count++;
  938. /* Update the inputA and inputB pointers for next MAC calculation */
  939. px = pIn1 + count;
  940. py = pSrc2;
  941. /* Decrement the loop counter */
  942. blkCnt--;
  943. }
  944. }
  945. else
  946. {
  947. /* If srcBLen is less than 4,
  948. * the blockSize2 loop cannot be unrolled by 4 */
  949. blkCnt = blockSize2;
  950. while(blkCnt > 0u)
  951. {
  952. /* Accumulator is made zero for every iteration */
  953. sum = 0;
  954. /* srcBLen number of MACS should be performed */
  955. k = srcBLen;
  956. while(k > 0u)
  957. {
  958. /* Perform the multiply-accumulate */
  959. sum += ((q31_t) * px++ * *py--);
  960. /* Decrement the loop counter */
  961. k--;
  962. }
  963. /* Store the result in the accumulator in the destination buffer. */
  964. *pOut++ = (q15_t) (sum >> 15);
  965. /* Increment the MAC count */
  966. count++;
  967. /* Update the inputA and inputB pointers for next MAC calculation */
  968. px = pIn1 + count;
  969. py = pSrc2;
  970. /* Decrement the loop counter */
  971. blkCnt--;
  972. }
  973. }
  974. /* --------------------------
  975. * Initializations of stage3
  976. * -------------------------*/
  977. /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
  978. * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
  979. * ....
  980. * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
  981. * sum += x[srcALen-1] * y[srcBLen-1]
  982. */
  983. /* In this stage the MAC operations are decreased by 1 for every iteration.
  984. The blockSize3 variable holds the number of MAC operations performed */
  985. /* Working pointer of inputA */
  986. pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
  987. px = pSrc1;
  988. /* Working pointer of inputB */
  989. pSrc2 = pIn2 + (srcBLen - 1u);
  990. pIn2 = pSrc2 - 1u;
  991. py = pIn2;
  992. /* -------------------
  993. * Stage3 process
  994. * ------------------*/
  995. /* For loop unrolling by 4, this stage is divided into two. */
  996. /* First part of this stage computes the MAC operations greater than 4 */
  997. /* Second part of this stage computes the MAC operations less than or equal to 4 */
  998. /* The first part of the stage starts here */
  999. j = blockSize3 >> 2u;
  1000. while((j > 0u) && (blockSize3 > 0u))
  1001. {
  1002. /* Accumulator is made zero for every iteration */
  1003. sum = 0;
  1004. /* Apply loop unrolling and compute 4 MACs simultaneously. */
  1005. k = blockSize3 >> 2u;
  1006. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
  1007. ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  1008. py++;
  1009. while(k > 0u)
  1010. {
  1011. sum += ((q31_t) * px++ * *py--);
  1012. sum += ((q31_t) * px++ * *py--);
  1013. sum += ((q31_t) * px++ * *py--);
  1014. sum += ((q31_t) * px++ * *py--);
  1015. /* Decrement the loop counter */
  1016. k--;
  1017. }
  1018. /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
  1019. ** No loop unrolling is used. */
  1020. k = blockSize3 % 0x4u;
  1021. while(k > 0u)
  1022. {
  1023. /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
  1024. sum += ((q31_t) * px++ * *py--);
  1025. /* Decrement the loop counter */
  1026. k--;
  1027. }
  1028. /* Store the result in the accumulator in the destination buffer. */
  1029. *pOut++ = (q15_t) (sum >> 15);
  1030. /* Update the inputA and inputB pointers for next MAC calculation */
  1031. px = ++pSrc1;
  1032. py = pIn2;
  1033. /* Decrement the loop counter */
  1034. blockSize3--;
  1035. j--;
  1036. }
  1037. /* The second part of the stage starts here */
  1038. /* SIMD is not used for the next MAC operations,
  1039. * so pointer py is updated to read only one sample at a time */
  1040. py = py + 1u;
  1041. while(blockSize3 > 0u)
  1042. {
  1043. /* Accumulator is made zero for every iteration */
  1044. sum = 0;
  1045. /* blockSize3 MACs are performed for this output sample; no loop unrolling is used */
  1046. k = blockSize3;
  1047. while(k > 0u)
  1048. {
  1049. /* Perform the multiply-accumulates */
  1050. /* sum += x[srcALen-1] * y[srcBLen-1] */
  1051. sum += ((q31_t) * px++ * *py--);
  1052. /* Decrement the loop counter */
  1053. k--;
  1054. }
  1055. /* Store the result in the accumulator in the destination buffer. */
  1056. *pOut++ = (q15_t) (sum >> 15);
  1057. /* Update the inputA and inputB pointers for next MAC calculation */
  1058. px = ++pSrc1;
  1059. py = pSrc2;
  1060. /* Decrement the loop counter */
  1061. blockSize3--;
  1062. }
  1063. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  1064. }
  1065. /**
  1066. * @} end of Conv group
  1067. */