Keyboard firmwares for Atmel AVR and Cortex-M
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

arm_conv_partial_opt_q7.c 21KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_conv_partial_opt_q7.c
  9. *
  10. * Description: Partial convolution of Q7 sequences.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup PartialConv
  46. * @{
  47. */
  48. /**
  49. * @brief Partial convolution of Q7 sequences.
  50. * @param[in] *pSrcA points to the first input sequence.
  51. * @param[in] srcALen length of the first input sequence.
  52. * @param[in] *pSrcB points to the second input sequence.
  53. * @param[in] srcBLen length of the second input sequence.
  54. * @param[out] *pDst points to the location where the output result is written.
  55. * @param[in] firstIndex is the first output sample to start with.
  56. * @param[in] numPoints is the number of output points to be computed.
  57. * @param[in] *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
  58. * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
  59. * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  60. *
  61. * \par Restrictions
  62. * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  63. * In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
  64. *
  65. *
  66. *
  67. */
  68. #ifndef UNALIGNED_SUPPORT_DISABLE
  69. arm_status arm_conv_partial_opt_q7(
  70. q7_t * pSrcA,
  71. uint32_t srcALen,
  72. q7_t * pSrcB,
  73. uint32_t srcBLen,
  74. q7_t * pDst,
  75. uint32_t firstIndex,
  76. uint32_t numPoints,
  77. q15_t * pScratch1,
  78. q15_t * pScratch2)
  79. {
  80. q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
  81. q15_t x4; /* Temporary input variable */
  82. q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
  83. uint32_t j, k, blkCnt, tapCnt; /* loop counter */
  84. q7_t *px; /* Temporary input1 pointer */
  85. q15_t *py; /* Temporary input2 pointer */
  86. q31_t acc0, acc1, acc2, acc3; /* Accumulator */
  87. q31_t x1, x2, x3, y1; /* Temporary input variables */
  88. arm_status status;
  89. q7_t *pOut = pDst; /* output pointer */
  90. q7_t out0, out1, out2, out3; /* temporary variables */
  91. /* Check for range of output samples to be calculated */
  92. if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  93. {
  94. /* Set status as ARM_MATH_ARGUMENT_ERROR */
  95. status = ARM_MATH_ARGUMENT_ERROR;
  96. }
  97. else
  98. {
  99. /* The algorithm implementation is based on the lengths of the inputs. */
  100. /* srcB is always made to slide across srcA. */
  101. /* So srcBLen is always considered as shorter or equal to srcALen */
  102. if(srcALen >= srcBLen)
  103. {
  104. /* Initialization of inputA pointer */
  105. pIn1 = pSrcA;
  106. /* Initialization of inputB pointer */
  107. pIn2 = pSrcB;
  108. }
  109. else
  110. {
  111. /* Initialization of inputA pointer */
  112. pIn1 = pSrcB;
  113. /* Initialization of inputB pointer */
  114. pIn2 = pSrcA;
  115. /* srcBLen is always considered as shorter or equal to srcALen */
  116. j = srcBLen;
  117. srcBLen = srcALen;
  118. srcALen = j;
  119. }
  120. /* pointer to take end of scratch2 buffer */
  121. pScr2 = pScratch2;
  122. /* points to smaller length sequence */
  123. px = pIn2 + srcBLen - 1;
  124. /* Apply loop unrolling and do 4 Copies simultaneously. */
  125. k = srcBLen >> 2u;
  126. /* First part of the processing with loop unrolling copies 4 data points at a time.
  127. ** a second loop below copies for the remaining 1 to 3 samples. */
  128. while(k > 0u)
  129. {
  130. /* copy second buffer in reversal manner */
  131. x4 = (q15_t) * px--;
  132. *pScr2++ = x4;
  133. x4 = (q15_t) * px--;
  134. *pScr2++ = x4;
  135. x4 = (q15_t) * px--;
  136. *pScr2++ = x4;
  137. x4 = (q15_t) * px--;
  138. *pScr2++ = x4;
  139. /* Decrement the loop counter */
  140. k--;
  141. }
  142. /* If the count is not a multiple of 4, copy remaining samples here.
  143. ** No loop unrolling is used. */
  144. k = srcBLen % 0x4u;
  145. while(k > 0u)
  146. {
  147. /* copy second buffer in reversal manner for remaining samples */
  148. x4 = (q15_t) * px--;
  149. *pScr2++ = x4;
  150. /* Decrement the loop counter */
  151. k--;
  152. }
  153. /* Initialze temporary scratch pointer */
  154. pScr1 = pScratch1;
  155. /* Fill (srcBLen - 1u) zeros in scratch buffer */
  156. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  157. /* Update temporary scratch pointer */
  158. pScr1 += (srcBLen - 1u);
  159. /* Copy (srcALen) samples in scratch buffer */
  160. /* Apply loop unrolling and do 4 Copies simultaneously. */
  161. k = srcALen >> 2u;
  162. /* First part of the processing with loop unrolling copies 4 data points at a time.
  163. ** a second loop below copies for the remaining 1 to 3 samples. */
  164. while(k > 0u)
  165. {
  166. /* copy second buffer in reversal manner */
  167. x4 = (q15_t) * pIn1++;
  168. *pScr1++ = x4;
  169. x4 = (q15_t) * pIn1++;
  170. *pScr1++ = x4;
  171. x4 = (q15_t) * pIn1++;
  172. *pScr1++ = x4;
  173. x4 = (q15_t) * pIn1++;
  174. *pScr1++ = x4;
  175. /* Decrement the loop counter */
  176. k--;
  177. }
  178. /* If the count is not a multiple of 4, copy remaining samples here.
  179. ** No loop unrolling is used. */
  180. k = srcALen % 0x4u;
  181. while(k > 0u)
  182. {
  183. /* copy second buffer in reversal manner for remaining samples */
  184. x4 = (q15_t) * pIn1++;
  185. *pScr1++ = x4;
  186. /* Decrement the loop counter */
  187. k--;
  188. }
  189. /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  190. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  191. /* Update pointer */
  192. pScr1 += (srcBLen - 1u);
  193. /* Temporary pointer for scratch2 */
  194. py = pScratch2;
  195. /* Initialization of pIn2 pointer */
  196. pIn2 = (q7_t *) py;
  197. pScr2 = py;
  198. pOut = pDst + firstIndex;
  199. pScratch1 += firstIndex;
  200. /* Actual convolution process starts here */
  201. blkCnt = (numPoints) >> 2;
  202. while(blkCnt > 0)
  203. {
  204. /* Initialze temporary scratch pointer as scratch1 */
  205. pScr1 = pScratch1;
  206. /* Clear Accumlators */
  207. acc0 = 0;
  208. acc1 = 0;
  209. acc2 = 0;
  210. acc3 = 0;
  211. /* Read two samples from scratch1 buffer */
  212. x1 = *__SIMD32(pScr1)++;
  213. /* Read next two samples from scratch1 buffer */
  214. x2 = *__SIMD32(pScr1)++;
  215. tapCnt = (srcBLen) >> 2u;
  216. while(tapCnt > 0u)
  217. {
  218. /* Read four samples from smaller buffer */
  219. y1 = _SIMD32_OFFSET(pScr2);
  220. /* multiply and accumlate */
  221. acc0 = __SMLAD(x1, y1, acc0);
  222. acc2 = __SMLAD(x2, y1, acc2);
  223. /* pack input data */
  224. #ifndef ARM_MATH_BIG_ENDIAN
  225. x3 = __PKHBT(x2, x1, 0);
  226. #else
  227. x3 = __PKHBT(x1, x2, 0);
  228. #endif
  229. /* multiply and accumlate */
  230. acc1 = __SMLADX(x3, y1, acc1);
  231. /* Read next two samples from scratch1 buffer */
  232. x1 = *__SIMD32(pScr1)++;
  233. /* pack input data */
  234. #ifndef ARM_MATH_BIG_ENDIAN
  235. x3 = __PKHBT(x1, x2, 0);
  236. #else
  237. x3 = __PKHBT(x2, x1, 0);
  238. #endif
  239. acc3 = __SMLADX(x3, y1, acc3);
  240. /* Read four samples from smaller buffer */
  241. y1 = _SIMD32_OFFSET(pScr2 + 2u);
  242. acc0 = __SMLAD(x2, y1, acc0);
  243. acc2 = __SMLAD(x1, y1, acc2);
  244. acc1 = __SMLADX(x3, y1, acc1);
  245. x2 = *__SIMD32(pScr1)++;
  246. #ifndef ARM_MATH_BIG_ENDIAN
  247. x3 = __PKHBT(x2, x1, 0);
  248. #else
  249. x3 = __PKHBT(x1, x2, 0);
  250. #endif
  251. acc3 = __SMLADX(x3, y1, acc3);
  252. pScr2 += 4u;
  253. /* Decrement the loop counter */
  254. tapCnt--;
  255. }
  256. /* Update scratch pointer for remaining samples of smaller length sequence */
  257. pScr1 -= 4u;
  258. /* apply same above for remaining samples of smaller length sequence */
  259. tapCnt = (srcBLen) & 3u;
  260. while(tapCnt > 0u)
  261. {
  262. /* accumlate the results */
  263. acc0 += (*pScr1++ * *pScr2);
  264. acc1 += (*pScr1++ * *pScr2);
  265. acc2 += (*pScr1++ * *pScr2);
  266. acc3 += (*pScr1++ * *pScr2++);
  267. pScr1 -= 3u;
  268. /* Decrement the loop counter */
  269. tapCnt--;
  270. }
  271. blkCnt--;
  272. /* Store the result in the accumulator in the destination buffer. */
  273. out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
  274. out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
  275. out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
  276. out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
  277. *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
  278. /* Initialization of inputB pointer */
  279. pScr2 = py;
  280. pScratch1 += 4u;
  281. }
  282. blkCnt = (numPoints) & 0x3;
  283. /* Calculate convolution for remaining samples of Bigger length sequence */
  284. while(blkCnt > 0)
  285. {
  286. /* Initialze temporary scratch pointer as scratch1 */
  287. pScr1 = pScratch1;
  288. /* Clear Accumlators */
  289. acc0 = 0;
  290. tapCnt = (srcBLen) >> 1u;
  291. while(tapCnt > 0u)
  292. {
  293. /* Read next two samples from scratch1 buffer */
  294. x1 = *__SIMD32(pScr1)++;
  295. /* Read two samples from smaller buffer */
  296. y1 = *__SIMD32(pScr2)++;
  297. acc0 = __SMLAD(x1, y1, acc0);
  298. /* Decrement the loop counter */
  299. tapCnt--;
  300. }
  301. tapCnt = (srcBLen) & 1u;
  302. /* apply same above for remaining samples of smaller length sequence */
  303. while(tapCnt > 0u)
  304. {
  305. /* accumlate the results */
  306. acc0 += (*pScr1++ * *pScr2++);
  307. /* Decrement the loop counter */
  308. tapCnt--;
  309. }
  310. blkCnt--;
  311. /* Store the result in the accumulator in the destination buffer. */
  312. *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
  313. /* Initialization of inputB pointer */
  314. pScr2 = py;
  315. pScratch1 += 1u;
  316. }
  317. /* set status as ARM_MATH_SUCCESS */
  318. status = ARM_MATH_SUCCESS;
  319. }
  320. return (status);
  321. }
  322. #else
  323. arm_status arm_conv_partial_opt_q7(
  324. q7_t * pSrcA,
  325. uint32_t srcALen,
  326. q7_t * pSrcB,
  327. uint32_t srcBLen,
  328. q7_t * pDst,
  329. uint32_t firstIndex,
  330. uint32_t numPoints,
  331. q15_t * pScratch1,
  332. q15_t * pScratch2)
  333. {
  334. q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
  335. q15_t x4; /* Temporary input variable */
  336. q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
  337. uint32_t j, k, blkCnt, tapCnt; /* loop counter */
  338. q7_t *px; /* Temporary input1 pointer */
  339. q15_t *py; /* Temporary input2 pointer */
  340. q31_t acc0, acc1, acc2, acc3; /* Accumulator */
  341. arm_status status;
  342. q7_t *pOut = pDst; /* output pointer */
  343. q15_t x10, x11, x20, x21; /* Temporary input variables */
  344. q15_t y10, y11; /* Temporary input variables */
  345. q7_t out0, out1, out2, out3; /* temporary variables */
  346. /* Check for range of output samples to be calculated */
  347. if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  348. {
  349. /* Set status as ARM_MATH_ARGUMENT_ERROR */
  350. status = ARM_MATH_ARGUMENT_ERROR;
  351. }
  352. else
  353. {
  354. /* The algorithm implementation is based on the lengths of the inputs. */
  355. /* srcB is always made to slide across srcA. */
  356. /* So srcBLen is always considered as shorter or equal to srcALen */
  357. if(srcALen >= srcBLen)
  358. {
  359. /* Initialization of inputA pointer */
  360. pIn1 = pSrcA;
  361. /* Initialization of inputB pointer */
  362. pIn2 = pSrcB;
  363. }
  364. else
  365. {
  366. /* Initialization of inputA pointer */
  367. pIn1 = pSrcB;
  368. /* Initialization of inputB pointer */
  369. pIn2 = pSrcA;
  370. /* srcBLen is always considered as shorter or equal to srcALen */
  371. j = srcBLen;
  372. srcBLen = srcALen;
  373. srcALen = j;
  374. }
  375. /* pointer to take end of scratch2 buffer */
  376. pScr2 = pScratch2;
  377. /* points to smaller length sequence */
  378. px = pIn2 + srcBLen - 1;
  379. /* Apply loop unrolling and do 4 Copies simultaneously. */
  380. k = srcBLen >> 2u;
  381. /* First part of the processing with loop unrolling copies 4 data points at a time.
  382. ** a second loop below copies for the remaining 1 to 3 samples. */
  383. while(k > 0u)
  384. {
  385. /* copy second buffer in reversal manner */
  386. x4 = (q15_t) * px--;
  387. *pScr2++ = x4;
  388. x4 = (q15_t) * px--;
  389. *pScr2++ = x4;
  390. x4 = (q15_t) * px--;
  391. *pScr2++ = x4;
  392. x4 = (q15_t) * px--;
  393. *pScr2++ = x4;
  394. /* Decrement the loop counter */
  395. k--;
  396. }
  397. /* If the count is not a multiple of 4, copy remaining samples here.
  398. ** No loop unrolling is used. */
  399. k = srcBLen % 0x4u;
  400. while(k > 0u)
  401. {
  402. /* copy second buffer in reversal manner for remaining samples */
  403. x4 = (q15_t) * px--;
  404. *pScr2++ = x4;
  405. /* Decrement the loop counter */
  406. k--;
  407. }
  408. /* Initialze temporary scratch pointer */
  409. pScr1 = pScratch1;
  410. /* Fill (srcBLen - 1u) zeros in scratch buffer */
  411. arm_fill_q15(0, pScr1, (srcBLen - 1u));
  412. /* Update temporary scratch pointer */
  413. pScr1 += (srcBLen - 1u);
  414. /* Copy (srcALen) samples in scratch buffer */
  415. /* Apply loop unrolling and do 4 Copies simultaneously. */
  416. k = srcALen >> 2u;
  417. /* First part of the processing with loop unrolling copies 4 data points at a time.
  418. ** a second loop below copies for the remaining 1 to 3 samples. */
  419. while(k > 0u)
  420. {
  421. /* copy second buffer in reversal manner */
  422. x4 = (q15_t) * pIn1++;
  423. *pScr1++ = x4;
  424. x4 = (q15_t) * pIn1++;
  425. *pScr1++ = x4;
  426. x4 = (q15_t) * pIn1++;
  427. *pScr1++ = x4;
  428. x4 = (q15_t) * pIn1++;
  429. *pScr1++ = x4;
  430. /* Decrement the loop counter */
  431. k--;
  432. }
  433. /* If the count is not a multiple of 4, copy remaining samples here.
  434. ** No loop unrolling is used. */
  435. k = srcALen % 0x4u;
  436. while(k > 0u)
  437. {
  438. /* copy second buffer in reversal manner for remaining samples */
  439. x4 = (q15_t) * pIn1++;
  440. *pScr1++ = x4;
  441. /* Decrement the loop counter */
  442. k--;
  443. }
  444. /* Apply loop unrolling and do 4 Copies simultaneously. */
  445. k = (srcBLen - 1u) >> 2u;
  446. /* First part of the processing with loop unrolling copies 4 data points at a time.
  447. ** a second loop below copies for the remaining 1 to 3 samples. */
  448. while(k > 0u)
  449. {
  450. /* copy second buffer in reversal manner */
  451. *pScr1++ = 0;
  452. *pScr1++ = 0;
  453. *pScr1++ = 0;
  454. *pScr1++ = 0;
  455. /* Decrement the loop counter */
  456. k--;
  457. }
  458. /* If the count is not a multiple of 4, copy remaining samples here.
  459. ** No loop unrolling is used. */
  460. k = (srcBLen - 1u) % 0x4u;
  461. while(k > 0u)
  462. {
  463. /* copy second buffer in reversal manner for remaining samples */
  464. *pScr1++ = 0;
  465. /* Decrement the loop counter */
  466. k--;
  467. }
  468. /* Temporary pointer for scratch2 */
  469. py = pScratch2;
  470. /* Initialization of pIn2 pointer */
  471. pIn2 = (q7_t *) py;
  472. pScr2 = py;
  473. pOut = pDst + firstIndex;
  474. pScratch1 += firstIndex;
  475. /* Actual convolution process starts here */
  476. blkCnt = (numPoints) >> 2;
  477. while(blkCnt > 0)
  478. {
  479. /* Initialze temporary scratch pointer as scratch1 */
  480. pScr1 = pScratch1;
  481. /* Clear Accumlators */
  482. acc0 = 0;
  483. acc1 = 0;
  484. acc2 = 0;
  485. acc3 = 0;
  486. /* Read two samples from scratch1 buffer */
  487. x10 = *pScr1++;
  488. x11 = *pScr1++;
  489. /* Read next two samples from scratch1 buffer */
  490. x20 = *pScr1++;
  491. x21 = *pScr1++;
  492. tapCnt = (srcBLen) >> 2u;
  493. while(tapCnt > 0u)
  494. {
  495. /* Read four samples from smaller buffer */
  496. y10 = *pScr2;
  497. y11 = *(pScr2 + 1u);
  498. /* multiply and accumlate */
  499. acc0 += (q31_t) x10 *y10;
  500. acc0 += (q31_t) x11 *y11;
  501. acc2 += (q31_t) x20 *y10;
  502. acc2 += (q31_t) x21 *y11;
  503. acc1 += (q31_t) x11 *y10;
  504. acc1 += (q31_t) x20 *y11;
  505. /* Read next two samples from scratch1 buffer */
  506. x10 = *pScr1;
  507. x11 = *(pScr1 + 1u);
  508. /* multiply and accumlate */
  509. acc3 += (q31_t) x21 *y10;
  510. acc3 += (q31_t) x10 *y11;
  511. /* Read next two samples from scratch2 buffer */
  512. y10 = *(pScr2 + 2u);
  513. y11 = *(pScr2 + 3u);
  514. /* multiply and accumlate */
  515. acc0 += (q31_t) x20 *y10;
  516. acc0 += (q31_t) x21 *y11;
  517. acc2 += (q31_t) x10 *y10;
  518. acc2 += (q31_t) x11 *y11;
  519. acc1 += (q31_t) x21 *y10;
  520. acc1 += (q31_t) x10 *y11;
  521. /* Read next two samples from scratch1 buffer */
  522. x20 = *(pScr1 + 2);
  523. x21 = *(pScr1 + 3);
  524. /* multiply and accumlate */
  525. acc3 += (q31_t) x11 *y10;
  526. acc3 += (q31_t) x20 *y11;
  527. /* update scratch pointers */
  528. pScr1 += 4u;
  529. pScr2 += 4u;
  530. /* Decrement the loop counter */
  531. tapCnt--;
  532. }
  533. /* Update scratch pointer for remaining samples of smaller length sequence */
  534. pScr1 -= 4u;
  535. /* apply same above for remaining samples of smaller length sequence */
  536. tapCnt = (srcBLen) & 3u;
  537. while(tapCnt > 0u)
  538. {
  539. /* accumlate the results */
  540. acc0 += (*pScr1++ * *pScr2);
  541. acc1 += (*pScr1++ * *pScr2);
  542. acc2 += (*pScr1++ * *pScr2);
  543. acc3 += (*pScr1++ * *pScr2++);
  544. pScr1 -= 3u;
  545. /* Decrement the loop counter */
  546. tapCnt--;
  547. }
  548. blkCnt--;
  549. /* Store the result in the accumulator in the destination buffer. */
  550. out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
  551. out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
  552. out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
  553. out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
  554. *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
  555. /* Initialization of inputB pointer */
  556. pScr2 = py;
  557. pScratch1 += 4u;
  558. }
  559. blkCnt = (numPoints) & 0x3;
  560. /* Calculate convolution for remaining samples of Bigger length sequence */
  561. while(blkCnt > 0)
  562. {
  563. /* Initialze temporary scratch pointer as scratch1 */
  564. pScr1 = pScratch1;
  565. /* Clear Accumlators */
  566. acc0 = 0;
  567. tapCnt = (srcBLen) >> 1u;
  568. while(tapCnt > 0u)
  569. {
  570. /* Read next two samples from scratch1 buffer */
  571. x10 = *pScr1++;
  572. x11 = *pScr1++;
  573. /* Read two samples from smaller buffer */
  574. y10 = *pScr2++;
  575. y11 = *pScr2++;
  576. /* multiply and accumlate */
  577. acc0 += (q31_t) x10 *y10;
  578. acc0 += (q31_t) x11 *y11;
  579. /* Decrement the loop counter */
  580. tapCnt--;
  581. }
  582. tapCnt = (srcBLen) & 1u;
  583. /* apply same above for remaining samples of smaller length sequence */
  584. while(tapCnt > 0u)
  585. {
  586. /* accumlate the results */
  587. acc0 += (*pScr1++ * *pScr2++);
  588. /* Decrement the loop counter */
  589. tapCnt--;
  590. }
  591. blkCnt--;
  592. /* Store the result in the accumulator in the destination buffer. */
  593. *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
  594. /* Initialization of inputB pointer */
  595. pScr2 = py;
  596. pScratch1 += 1u;
  597. }
  598. /* set status as ARM_MATH_SUCCESS */
  599. status = ARM_MATH_SUCCESS;
  600. }
  601. return (status);
  602. }
  603. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  604. /**
  605. * @} end of PartialConv group
  606. */