Keyboard firmwares for Atmel AVR and Cortex-M
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

arm_conv_partial_fast_opt_q15.c 21KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_conv_partial_fast_opt_q15.c
  9. *
  10. * Description: Fast Q15 Partial convolution.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup PartialConv
  46. * @{
  47. */
  48. /**
  49. * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  50. * @param[in] *pSrcA points to the first input sequence.
  51. * @param[in] srcALen length of the first input sequence.
  52. * @param[in] *pSrcB points to the second input sequence.
  53. * @param[in] srcBLen length of the second input sequence.
  54. * @param[out] *pDst points to the location where the output result is written.
  55. * @param[in] firstIndex is the first output sample to start with.
  56. * @param[in] numPoints is the number of output points to be computed.
  57. * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
  58. * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
  59. * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  60. *
  61. * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
  62. *
  63. * \par Restrictions
  64. * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  65. * In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
  66. *
  67. */
  68. #ifndef UNALIGNED_SUPPORT_DISABLE
  69. arm_status arm_conv_partial_fast_opt_q15(
  70. q15_t * pSrcA,
  71. uint32_t srcALen,
  72. q15_t * pSrcB,
  73. uint32_t srcBLen,
  74. q15_t * pDst,
  75. uint32_t firstIndex,
  76. uint32_t numPoints,
  77. q15_t * pScratch1,
  78. q15_t * pScratch2)
  79. {
  80. q15_t *pOut = pDst; /* output pointer */
  81. q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
  82. q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
  83. q31_t acc0, acc1, acc2, acc3; /* Accumulator */
  84. q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
  85. q31_t y1, y2; /* State variables */
  86. q15_t *pIn1; /* inputA pointer */
  87. q15_t *pIn2; /* inputB pointer */
  88. q15_t *px; /* Intermediate inputA pointer */
  89. q15_t *py; /* Intermediate inputB pointer */
  90. uint32_t j, k, blkCnt; /* loop counter */
  91. arm_status status;
  92. uint32_t tapCnt; /* loop count */
  93. /* Check for range of output samples to be calculated */
  94. if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  95. {
  96. /* Set status as ARM_MATH_ARGUMENT_ERROR */
  97. status = ARM_MATH_ARGUMENT_ERROR;
  98. }
  99. else
  100. {
  101. /* The algorithm implementation is based on the lengths of the inputs. */
  102. /* srcB is always made to slide across srcA. */
  103. /* So srcBLen is always considered as shorter or equal to srcALen */
  104. if(srcALen >= srcBLen)
  105. {
  106. /* Initialization of inputA pointer */
  107. pIn1 = pSrcA;
  108. /* Initialization of inputB pointer */
  109. pIn2 = pSrcB;
  110. }
  111. else
  112. {
  113. /* Initialization of inputA pointer */
  114. pIn1 = pSrcB;
  115. /* Initialization of inputB pointer */
  116. pIn2 = pSrcA;
  117. /* srcBLen is always considered as shorter or equal to srcALen */
  118. j = srcBLen;
  119. srcBLen = srcALen;
  120. srcALen = j;
  121. }
  122. /* Temporary pointer for scratch2 */
  123. py = pScratch2;
  124. /* pointer to take end of scratch2 buffer */
  125. pScr2 = pScratch2 + srcBLen - 1;
  126. /* points to smaller length sequence */
  127. px = pIn2;
  128. /* Apply loop unrolling and do 4 Copies simultaneously. */
  129. k = srcBLen >> 2u;
  130. /* First part of the processing with loop unrolling copies 4 data points at a time.
  131. ** a second loop below copies for the remaining 1 to 3 samples. */
  132. /* Copy smaller length input sequence in reverse order into second scratch buffer */
  133. while(k > 0u)
  134. {
  135. /* copy second buffer in reversal manner */
  136. *pScr2-- = *px++;
  137. *pScr2-- = *px++;
  138. *pScr2-- = *px++;
  139. *pScr2-- = *px++;
  140. /* Decrement the loop counter */
  141. k--;
  142. }
  143. /* If the count is not a multiple of 4, copy remaining samples here.
  144. ** No loop unrolling is used. */
  145. k = srcBLen % 0x4u;
  146. while(k > 0u)
  147. {
  148. /* copy second buffer in reversal manner for remaining samples */
  149. *pScr2-- = *px++;
  150. /* Decrement the loop counter */
  151. k--;
  152. }
  153. /* Initialze temporary scratch pointer */
  154. pScr1 = pScratch1;
  155. /* Assuming scratch1 buffer is aligned by 32-bit */
  156. /* Fill (srcBLen - 1u) zeros in scratch buffer */
  157. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  158. /* Update temporary scratch pointer */
  159. pScr1 += (srcBLen - 1u);
  160. /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
  161. /* Copy (srcALen) samples in scratch buffer */
  162. arm_copy_q15(pIn1, pScr1, srcALen);
  163. /* Update pointers */
  164. pScr1 += srcALen;
  165. /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  166. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  167. /* Update pointer */
  168. pScr1 += (srcBLen - 1u);
  169. /* Initialization of pIn2 pointer */
  170. pIn2 = py;
  171. pScratch1 += firstIndex;
  172. pOut = pDst + firstIndex;
  173. /* First part of the processing with loop unrolling process 4 data points at a time.
  174. ** a second loop below process for the remaining 1 to 3 samples. */
  175. /* Actual convolution process starts here */
  176. blkCnt = (numPoints) >> 2;
  177. while(blkCnt > 0)
  178. {
  179. /* Initialze temporary scratch pointer as scratch1 */
  180. pScr1 = pScratch1;
  181. /* Clear Accumlators */
  182. acc0 = 0;
  183. acc1 = 0;
  184. acc2 = 0;
  185. acc3 = 0;
  186. /* Read two samples from scratch1 buffer */
  187. x1 = *__SIMD32(pScr1)++;
  188. /* Read next two samples from scratch1 buffer */
  189. x2 = *__SIMD32(pScr1)++;
  190. tapCnt = (srcBLen) >> 2u;
  191. while(tapCnt > 0u)
  192. {
  193. /* Read four samples from smaller buffer */
  194. y1 = _SIMD32_OFFSET(pIn2);
  195. y2 = _SIMD32_OFFSET(pIn2 + 2u);
  196. /* multiply and accumlate */
  197. acc0 = __SMLAD(x1, y1, acc0);
  198. acc2 = __SMLAD(x2, y1, acc2);
  199. /* pack input data */
  200. #ifndef ARM_MATH_BIG_ENDIAN
  201. x3 = __PKHBT(x2, x1, 0);
  202. #else
  203. x3 = __PKHBT(x1, x2, 0);
  204. #endif
  205. /* multiply and accumlate */
  206. acc1 = __SMLADX(x3, y1, acc1);
  207. /* Read next two samples from scratch1 buffer */
  208. x1 = _SIMD32_OFFSET(pScr1);
  209. /* multiply and accumlate */
  210. acc0 = __SMLAD(x2, y2, acc0);
  211. acc2 = __SMLAD(x1, y2, acc2);
  212. /* pack input data */
  213. #ifndef ARM_MATH_BIG_ENDIAN
  214. x3 = __PKHBT(x1, x2, 0);
  215. #else
  216. x3 = __PKHBT(x2, x1, 0);
  217. #endif
  218. acc3 = __SMLADX(x3, y1, acc3);
  219. acc1 = __SMLADX(x3, y2, acc1);
  220. x2 = _SIMD32_OFFSET(pScr1 + 2u);
  221. #ifndef ARM_MATH_BIG_ENDIAN
  222. x3 = __PKHBT(x2, x1, 0);
  223. #else
  224. x3 = __PKHBT(x1, x2, 0);
  225. #endif
  226. acc3 = __SMLADX(x3, y2, acc3);
  227. /* update scratch pointers */
  228. pIn2 += 4u;
  229. pScr1 += 4u;
  230. /* Decrement the loop counter */
  231. tapCnt--;
  232. }
  233. /* Update scratch pointer for remaining samples of smaller length sequence */
  234. pScr1 -= 4u;
  235. /* apply same above for remaining samples of smaller length sequence */
  236. tapCnt = (srcBLen) & 3u;
  237. while(tapCnt > 0u)
  238. {
  239. /* accumlate the results */
  240. acc0 += (*pScr1++ * *pIn2);
  241. acc1 += (*pScr1++ * *pIn2);
  242. acc2 += (*pScr1++ * *pIn2);
  243. acc3 += (*pScr1++ * *pIn2++);
  244. pScr1 -= 3u;
  245. /* Decrement the loop counter */
  246. tapCnt--;
  247. }
  248. blkCnt--;
  249. /* Store the results in the accumulators in the destination buffer. */
  250. #ifndef ARM_MATH_BIG_ENDIAN
  251. *__SIMD32(pOut)++ =
  252. __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
  253. *__SIMD32(pOut)++ =
  254. __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
  255. #else
  256. *__SIMD32(pOut)++ =
  257. __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
  258. *__SIMD32(pOut)++ =
  259. __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
  260. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  261. /* Initialization of inputB pointer */
  262. pIn2 = py;
  263. pScratch1 += 4u;
  264. }
  265. blkCnt = numPoints & 0x3;
  266. /* Calculate convolution for remaining samples of Bigger length sequence */
  267. while(blkCnt > 0)
  268. {
  269. /* Initialze temporary scratch pointer as scratch1 */
  270. pScr1 = pScratch1;
  271. /* Clear Accumlators */
  272. acc0 = 0;
  273. tapCnt = (srcBLen) >> 1u;
  274. while(tapCnt > 0u)
  275. {
  276. /* Read next two samples from scratch1 buffer */
  277. x1 = *__SIMD32(pScr1)++;
  278. /* Read two samples from smaller buffer */
  279. y1 = *__SIMD32(pIn2)++;
  280. acc0 = __SMLAD(x1, y1, acc0);
  281. /* Decrement the loop counter */
  282. tapCnt--;
  283. }
  284. tapCnt = (srcBLen) & 1u;
  285. /* apply same above for remaining samples of smaller length sequence */
  286. while(tapCnt > 0u)
  287. {
  288. /* accumlate the results */
  289. acc0 += (*pScr1++ * *pIn2++);
  290. /* Decrement the loop counter */
  291. tapCnt--;
  292. }
  293. blkCnt--;
  294. /* The result is in 2.30 format. Convert to 1.15 with saturation.
  295. ** Then store the output in the destination buffer. */
  296. *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  297. /* Initialization of inputB pointer */
  298. pIn2 = py;
  299. pScratch1 += 1u;
  300. }
  301. /* set status as ARM_MATH_SUCCESS */
  302. status = ARM_MATH_SUCCESS;
  303. }
  304. /* Return to application */
  305. return (status);
  306. }
  307. #else
  308. arm_status arm_conv_partial_fast_opt_q15(
  309. q15_t * pSrcA,
  310. uint32_t srcALen,
  311. q15_t * pSrcB,
  312. uint32_t srcBLen,
  313. q15_t * pDst,
  314. uint32_t firstIndex,
  315. uint32_t numPoints,
  316. q15_t * pScratch1,
  317. q15_t * pScratch2)
  318. {
  319. q15_t *pOut = pDst; /* output pointer */
  320. q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
  321. q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
  322. q31_t acc0, acc1, acc2, acc3; /* Accumulator */
  323. q15_t *pIn1; /* inputA pointer */
  324. q15_t *pIn2; /* inputB pointer */
  325. q15_t *px; /* Intermediate inputA pointer */
  326. q15_t *py; /* Intermediate inputB pointer */
  327. uint32_t j, k, blkCnt; /* loop counter */
  328. arm_status status; /* Status variable */
  329. uint32_t tapCnt; /* loop count */
  330. q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */
  331. q15_t y10, y11; /* Temporary variables to hold srcB buffer */
  332. /* Check for range of output samples to be calculated */
  333. if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  334. {
  335. /* Set status as ARM_MATH_ARGUMENT_ERROR */
  336. status = ARM_MATH_ARGUMENT_ERROR;
  337. }
  338. else
  339. {
  340. /* The algorithm implementation is based on the lengths of the inputs. */
  341. /* srcB is always made to slide across srcA. */
  342. /* So srcBLen is always considered as shorter or equal to srcALen */
  343. if(srcALen >= srcBLen)
  344. {
  345. /* Initialization of inputA pointer */
  346. pIn1 = pSrcA;
  347. /* Initialization of inputB pointer */
  348. pIn2 = pSrcB;
  349. }
  350. else
  351. {
  352. /* Initialization of inputA pointer */
  353. pIn1 = pSrcB;
  354. /* Initialization of inputB pointer */
  355. pIn2 = pSrcA;
  356. /* srcBLen is always considered as shorter or equal to srcALen */
  357. j = srcBLen;
  358. srcBLen = srcALen;
  359. srcALen = j;
  360. }
  361. /* Temporary pointer for scratch2 */
  362. py = pScratch2;
  363. /* pointer to take end of scratch2 buffer */
  364. pScr2 = pScratch2 + srcBLen - 1;
  365. /* points to smaller length sequence */
  366. px = pIn2;
  367. /* Apply loop unrolling and do 4 Copies simultaneously. */
  368. k = srcBLen >> 2u;
  369. /* First part of the processing with loop unrolling copies 4 data points at a time.
  370. ** a second loop below copies for the remaining 1 to 3 samples. */
  371. while(k > 0u)
  372. {
  373. /* copy second buffer in reversal manner */
  374. *pScr2-- = *px++;
  375. *pScr2-- = *px++;
  376. *pScr2-- = *px++;
  377. *pScr2-- = *px++;
  378. /* Decrement the loop counter */
  379. k--;
  380. }
  381. /* If the count is not a multiple of 4, copy remaining samples here.
  382. ** No loop unrolling is used. */
  383. k = srcBLen % 0x4u;
  384. while(k > 0u)
  385. {
  386. /* copy second buffer in reversal manner for remaining samples */
  387. *pScr2-- = *px++;
  388. /* Decrement the loop counter */
  389. k--;
  390. }
  391. /* Initialze temporary scratch pointer */
  392. pScr1 = pScratch1;
  393. /* Fill (srcBLen - 1u) zeros in scratch buffer */
  394. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  395. /* Update temporary scratch pointer */
  396. pScr1 += (srcBLen - 1u);
  397. /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
  398. /* Apply loop unrolling and do 4 Copies simultaneously. */
  399. k = srcALen >> 2u;
  400. /* First part of the processing with loop unrolling copies 4 data points at a time.
  401. ** a second loop below copies for the remaining 1 to 3 samples. */
  402. while(k > 0u)
  403. {
  404. /* copy second buffer in reversal manner */
  405. *pScr1++ = *pIn1++;
  406. *pScr1++ = *pIn1++;
  407. *pScr1++ = *pIn1++;
  408. *pScr1++ = *pIn1++;
  409. /* Decrement the loop counter */
  410. k--;
  411. }
  412. /* If the count is not a multiple of 4, copy remaining samples here.
  413. ** No loop unrolling is used. */
  414. k = srcALen % 0x4u;
  415. while(k > 0u)
  416. {
  417. /* copy second buffer in reversal manner for remaining samples */
  418. *pScr1++ = *pIn1++;
  419. /* Decrement the loop counter */
  420. k--;
  421. }
  422. /* Apply loop unrolling and do 4 Copies simultaneously. */
  423. k = (srcBLen - 1u) >> 2u;
  424. /* First part of the processing with loop unrolling copies 4 data points at a time.
  425. ** a second loop below copies for the remaining 1 to 3 samples. */
  426. while(k > 0u)
  427. {
  428. /* copy second buffer in reversal manner */
  429. *pScr1++ = 0;
  430. *pScr1++ = 0;
  431. *pScr1++ = 0;
  432. *pScr1++ = 0;
  433. /* Decrement the loop counter */
  434. k--;
  435. }
  436. /* If the count is not a multiple of 4, copy remaining samples here.
  437. ** No loop unrolling is used. */
  438. k = (srcBLen - 1u) % 0x4u;
  439. while(k > 0u)
  440. {
  441. /* copy second buffer in reversal manner for remaining samples */
  442. *pScr1++ = 0;
  443. /* Decrement the loop counter */
  444. k--;
  445. }
  446. /* Initialization of pIn2 pointer */
  447. pIn2 = py;
  448. pScratch1 += firstIndex;
  449. pOut = pDst + firstIndex;
  450. /* Actual convolution process starts here */
  451. blkCnt = (numPoints) >> 2;
  452. while(blkCnt > 0)
  453. {
  454. /* Initialze temporary scratch pointer as scratch1 */
  455. pScr1 = pScratch1;
  456. /* Clear Accumlators */
  457. acc0 = 0;
  458. acc1 = 0;
  459. acc2 = 0;
  460. acc3 = 0;
  461. /* Read two samples from scratch1 buffer */
  462. x10 = *pScr1++;
  463. x11 = *pScr1++;
  464. /* Read next two samples from scratch1 buffer */
  465. x20 = *pScr1++;
  466. x21 = *pScr1++;
  467. tapCnt = (srcBLen) >> 2u;
  468. while(tapCnt > 0u)
  469. {
  470. /* Read two samples from smaller buffer */
  471. y10 = *pIn2;
  472. y11 = *(pIn2 + 1u);
  473. /* multiply and accumlate */
  474. acc0 += (q31_t) x10 *y10;
  475. acc0 += (q31_t) x11 *y11;
  476. acc2 += (q31_t) x20 *y10;
  477. acc2 += (q31_t) x21 *y11;
  478. /* multiply and accumlate */
  479. acc1 += (q31_t) x11 *y10;
  480. acc1 += (q31_t) x20 *y11;
  481. /* Read next two samples from scratch1 buffer */
  482. x10 = *pScr1;
  483. x11 = *(pScr1 + 1u);
  484. /* multiply and accumlate */
  485. acc3 += (q31_t) x21 *y10;
  486. acc3 += (q31_t) x10 *y11;
  487. /* Read next two samples from scratch2 buffer */
  488. y10 = *(pIn2 + 2u);
  489. y11 = *(pIn2 + 3u);
  490. /* multiply and accumlate */
  491. acc0 += (q31_t) x20 *y10;
  492. acc0 += (q31_t) x21 *y11;
  493. acc2 += (q31_t) x10 *y10;
  494. acc2 += (q31_t) x11 *y11;
  495. acc1 += (q31_t) x21 *y10;
  496. acc1 += (q31_t) x10 *y11;
  497. /* Read next two samples from scratch1 buffer */
  498. x20 = *(pScr1 + 2);
  499. x21 = *(pScr1 + 3);
  500. /* multiply and accumlate */
  501. acc3 += (q31_t) x11 *y10;
  502. acc3 += (q31_t) x20 *y11;
  503. /* update scratch pointers */
  504. pIn2 += 4u;
  505. pScr1 += 4u;
  506. /* Decrement the loop counter */
  507. tapCnt--;
  508. }
  509. /* Update scratch pointer for remaining samples of smaller length sequence */
  510. pScr1 -= 4u;
  511. /* apply same above for remaining samples of smaller length sequence */
  512. tapCnt = (srcBLen) & 3u;
  513. while(tapCnt > 0u)
  514. {
  515. /* accumlate the results */
  516. acc0 += (*pScr1++ * *pIn2);
  517. acc1 += (*pScr1++ * *pIn2);
  518. acc2 += (*pScr1++ * *pIn2);
  519. acc3 += (*pScr1++ * *pIn2++);
  520. pScr1 -= 3u;
  521. /* Decrement the loop counter */
  522. tapCnt--;
  523. }
  524. blkCnt--;
  525. /* Store the results in the accumulators in the destination buffer. */
  526. *pOut++ = __SSAT((acc0 >> 15), 16);
  527. *pOut++ = __SSAT((acc1 >> 15), 16);
  528. *pOut++ = __SSAT((acc2 >> 15), 16);
  529. *pOut++ = __SSAT((acc3 >> 15), 16);
  530. /* Initialization of inputB pointer */
  531. pIn2 = py;
  532. pScratch1 += 4u;
  533. }
  534. blkCnt = numPoints & 0x3;
  535. /* Calculate convolution for remaining samples of Bigger length sequence */
  536. while(blkCnt > 0)
  537. {
  538. /* Initialze temporary scratch pointer as scratch1 */
  539. pScr1 = pScratch1;
  540. /* Clear Accumlators */
  541. acc0 = 0;
  542. tapCnt = (srcBLen) >> 1u;
  543. while(tapCnt > 0u)
  544. {
  545. /* Read next two samples from scratch1 buffer */
  546. x10 = *pScr1++;
  547. x11 = *pScr1++;
  548. /* Read two samples from smaller buffer */
  549. y10 = *pIn2++;
  550. y11 = *pIn2++;
  551. /* multiply and accumlate */
  552. acc0 += (q31_t) x10 *y10;
  553. acc0 += (q31_t) x11 *y11;
  554. /* Decrement the loop counter */
  555. tapCnt--;
  556. }
  557. tapCnt = (srcBLen) & 1u;
  558. /* apply same above for remaining samples of smaller length sequence */
  559. while(tapCnt > 0u)
  560. {
  561. /* accumlate the results */
  562. acc0 += (*pScr1++ * *pIn2++);
  563. /* Decrement the loop counter */
  564. tapCnt--;
  565. }
  566. blkCnt--;
  567. /* Store the result in the accumulator in the destination buffer. */
  568. *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  569. /* Initialization of inputB pointer */
  570. pIn2 = py;
  571. pScratch1 += 1u;
  572. }
  573. /* set status as ARM_MATH_SUCCESS */
  574. status = ARM_MATH_SUCCESS;
  575. }
  576. /* Return to application */
  577. return (status);
  578. }
  579. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  580. /**
  581. * @} end of PartialConv group
  582. */