Keyboard firmwares for Atmel AVR and Cortex-M
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

arm_fir_q15.c 21KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_fir_q15.c
  9. *
  10. * Description: Q15 FIR filter processing function.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup FIR
  46. * @{
  47. */
  48. /**
  49. * @brief Processing function for the Q15 FIR filter.
  50. * @param[in] *S points to an instance of the Q15 FIR structure.
  51. * @param[in] *pSrc points to the block of input data.
  52. * @param[out] *pDst points to the block of output data.
  53. * @param[in] blockSize number of samples to process per call.
  54. * @return none.
  55. *
  56. *
  57. * \par Restrictions
  58. * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  59. * In this case the input, output, and state buffers should be 32-bit aligned.
  60. *
  61. * <b>Scaling and Overflow Behavior:</b>
  62. * \par
  63. * The function is implemented using a 64-bit internal accumulator.
  64. * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
  65. * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
  66. * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
  67. * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding the low 15 bits.
  68. * Lastly, the accumulator is saturated to yield a result in 1.15 format.
  69. *
  70. * \par
  71. * Refer to the function <code>arm_fir_fast_q15()</code> for a faster but less precise implementation of this function.
  72. */
  73. #ifndef ARM_MATH_CM0_FAMILY
  74. /* Run the below code for Cortex-M4 and Cortex-M3 */
  75. #ifndef UNALIGNED_SUPPORT_DISABLE
  /* Q15 FIR variant compiled when neither ARM_MATH_CM0_FAMILY nor
  ** UNALIGNED_SUPPORT_DISABLE is defined.
  **
  ** S         filter instance; pState, pCoeffs and numTaps are read from it.
  ** pSrc      blockSize input samples.
  ** pDst      receives blockSize output samples.
  ** blockSize number of samples to process.
  **
  ** The new input samples are appended to the state buffer starting at
  ** S->pState[numTaps - 1], so the state buffer is written up to index
  ** numTaps - 1 + blockSize - 1. After filtering, the trailing numTaps - 1
  ** state samples are moved back to the start of S->pState for the next call.
  **
  ** Four outputs are produced per outer iteration; state words are fetched at
  ** consecutive q15 offsets (px1, px1 + 1u, ...) through _SIMD32_OFFSET.
  ** NOTE(review): _SIMD32_OFFSET / __SIMD32 are defined outside this file and
  ** are presumably 32-bit accesses through a q15 pointer - confirm in arm_math.h. */
  76. void arm_fir_q15(
  77. const arm_fir_instance_q15 * S,
  78. q15_t * pSrc,
  79. q15_t * pDst,
  80. uint32_t blockSize)
  81. {
  82. q15_t *pState = S->pState; /* State pointer */
  83. q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  84. q15_t *pStateCurnt; /* Points to the current sample of the state */
  85. q15_t *px1; /* Temporary q15 pointer for state buffer */
  86. q15_t *pb; /* Temporary pointer for coefficient buffer */
  87. q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */
  88. q63_t acc0, acc1, acc2, acc3; /* Accumulators */
  89. uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
  90. uint32_t tapCnt, blkCnt; /* Loop counters */
  91. /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  92. /* pStateCurnt points to the location where the new input data should be written */
  93. pStateCurnt = &(S->pState[(numTaps - 1u)]);
  94. /* Apply loop unrolling and compute 4 output values simultaneously.
  95. * The variables acc0 ... acc3 hold output values that are being computed:
  96. *
  97. * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
  98. * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
  99. * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
  100. * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
  101. */
  102. blkCnt = blockSize >> 2;
  103. /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
  104. ** A second loop below computes the remaining 1 to 3 samples. */
  105. while(blkCnt > 0u)
  106. {
  107. /* Copy four new input samples into the state buffer.
  108. ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
  109. *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
  110. *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
  111. /* Set all accumulators to zero */
  112. acc0 = 0;
  113. acc1 = 0;
  114. acc2 = 0;
  115. acc3 = 0;
  116. /* Initialize state pointer of type q15 */
  117. px1 = pState;
  118. /* Initialize coeff pointer of type q15 */
  119. pb = pCoeffs;
  120. /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
  121. x0 = _SIMD32_OFFSET(px1);
  122. /* Read the second and third samples from the state buffer (one q15 further on): x[n-N-1], x[n-N-2] */
  123. x1 = _SIMD32_OFFSET(px1 + 1u);
  124. px1 += 2u;
  125. /* Loop over the number of taps. Unroll by a factor of 4.
  126. ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
  127. tapCnt = numTaps >> 2;
  128. while(tapCnt > 0u)
  129. {
  130. /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
  131. c0 = *__SIMD32(pb)++;
  132. /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
  133. acc0 = __SMLALD(x0, c0, acc0);
  134. /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
  135. acc1 = __SMLALD(x1, c0, acc1);
  136. /* Read state x[n-N-2], x[n-N-3] */
  137. x2 = _SIMD32_OFFSET(px1);
  138. /* Read state x[n-N-3], x[n-N-4] */
  139. x3 = _SIMD32_OFFSET(px1 + 1u);
  140. /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
  141. acc2 = __SMLALD(x2, c0, acc2);
  142. /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
  143. acc3 = __SMLALD(x3, c0, acc3);
  144. /* Read coefficients b[N-2], b[N-3] */
  145. c0 = *__SIMD32(pb)++;
  146. /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
  147. acc0 = __SMLALD(x2, c0, acc0);
  148. /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
  149. acc1 = __SMLALD(x3, c0, acc1);
  150. /* Read state x[n-N-4], x[n-N-5]; x0/x1 are reloaded here so they are ready for the next iteration (or the 2-tap tail) */
  151. x0 = _SIMD32_OFFSET(px1 + 2u);
  152. /* Read state x[n-N-5], x[n-N-6] */
  153. x1 = _SIMD32_OFFSET(px1 + 3u);
  154. /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
  155. acc2 = __SMLALD(x0, c0, acc2);
  156. /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
  157. acc3 = __SMLALD(x1, c0, acc3);
  158. px1 += 4u;
  159. tapCnt--;
  160. }
  161. /* If the filter length is not a multiple of 4, compute the remaining filter taps.
  162. ** Any non-zero remainder is handled as exactly 2 taps (one coefficient pair),
  163. ** which is only correct when the filter length is even. */
  163. if((numTaps & 0x3u) != 0u)
  164. {
  165. /* Read 2 coefficients */
  166. c0 = *__SIMD32(pb)++;
  167. /* Fetch 4 state variables */
  168. x2 = _SIMD32_OFFSET(px1);
  169. x3 = _SIMD32_OFFSET(px1 + 1u);
  170. /* Perform the multiply-accumulates; x0 and x1 still hold the words loaded before/inside the loop above */
  171. acc0 = __SMLALD(x0, c0, acc0);
  172. px1 += 2u;
  173. acc1 = __SMLALD(x1, c0, acc1);
  174. acc2 = __SMLALD(x2, c0, acc2);
  175. acc3 = __SMLALD(x3, c0, acc3);
  176. }
  177. /* Each 64-bit accumulator is shifted right by 15 and saturated to 16 bits (1.15).
  178. ** The 4 outputs are then stored in the destination buffer, two per 32-bit write. */
  179. #ifndef ARM_MATH_BIG_ENDIAN
  180. *__SIMD32(pDst)++ =
  181. __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
  182. *__SIMD32(pDst)++ =
  183. __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
  184. #else
  185. *__SIMD32(pDst)++ =
  186. __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
  187. *__SIMD32(pDst)++ =
  188. __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
  189. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  190. /* Advance the state pointer by 4 to process the next group of 4 samples */
  191. pState = pState + 4;
  192. /* Decrement the loop counter */
  193. blkCnt--;
  194. }
  195. /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
  196. ** No loop unrolling is used. */
  197. blkCnt = blockSize % 0x4u;
  198. while(blkCnt > 0u)
  199. {
  200. /* Copy one sample into the state buffer */
  201. *pStateCurnt++ = *pSrc++;
  202. /* Set the accumulator to zero */
  203. acc0 = 0;
  204. /* Initialize state pointer of type q15 */
  205. px1 = pState;
  206. /* Initialize coeff pointer of type q15 */
  207. pb = pCoeffs;
  /* tapCnt counts coefficient pairs. The do/while body runs at least once,
  ** so this path processes a minimum of 2 taps and ignores an odd final tap. */
  208. tapCnt = numTaps >> 1;
  209. do
  210. {
  211. c0 = *__SIMD32(pb)++;
  212. x0 = *__SIMD32(px1)++;
  213. acc0 = __SMLALD(x0, c0, acc0);
  214. tapCnt--;
  215. }
  216. while(tapCnt > 0u);
  217. /* Shift the accumulator right by 15 and saturate to 16 bits (1.15).
  218. ** Then store the output in the destination buffer. */
  219. *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  220. /* Advance state pointer by 1 for the next sample */
  221. pState = pState + 1;
  222. /* Decrement the loop counter */
  223. blkCnt--;
  224. }
  225. /* Processing is complete.
  226. ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  227. ** This prepares the state buffer for the next function call. */
  228. /* Points to the start of the state buffer */
  229. pStateCurnt = S->pState;
  230. /* Number of groups of four samples to copy; each group takes two SIMD writes */
  231. tapCnt = (numTaps - 1u) >> 2;
  232. while(tapCnt > 0u)
  233. {
  234. /* Copy state values to start of state buffer */
  235. *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
  236. *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
  237. tapCnt--;
  238. }
  239. /* Calculation of count for remaining q15_t data */
  240. tapCnt = (numTaps - 1u) % 0x4u;
  241. /* Copy the remaining 0 to 3 samples one at a time */
  242. while(tapCnt > 0u)
  243. {
  244. *pStateCurnt++ = *pState++;
  245. /* Decrement the loop counter */
  246. tapCnt--;
  247. }
  248. }
  249. #else /* UNALIGNED_SUPPORT_DISABLE */
  /* Q15 FIR variant compiled when ARM_MATH_CM0_FAMILY is not defined and
  ** UNALIGNED_SUPPORT_DISABLE is defined.
  **
  ** S         filter instance; pState, pCoeffs and numTaps are read from it.
  ** pSrc      blockSize input samples.
  ** pDst      receives blockSize output samples.
  ** blockSize number of samples to process.
  **
  ** Same buffer handling as the other variants: new samples are appended at
  ** S->pState[numTaps - 1], and the trailing numTaps - 1 state samples are
  ** moved back to the start of S->pState before returning.
  **
  ** In the 4-output loop the state is only fetched at even q15 offsets from
  ** pState (px, px + 2u); the pairs needed for acc1/acc3 are assembled from
  ** those words with __PKHBT and consumed with __SMLALDX. Input copy and
  ** state compaction are done one q15 at a time. State, coefficients and pDst
  ** are still accessed through __SIMD32 in the 4-output loop. */
  250. void arm_fir_q15(
  251. const arm_fir_instance_q15 * S,
  252. q15_t * pSrc,
  253. q15_t * pDst,
  254. uint32_t blockSize)
  255. {
  256. q15_t *pState = S->pState; /* State pointer */
  257. q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  258. q15_t *pStateCurnt; /* Points to the current sample of the state */
  259. q63_t acc0, acc1, acc2, acc3; /* Accumulators */
  260. q15_t *pb; /* Temporary pointer for coefficient buffer */
  261. q15_t *px; /* Temporary q15 pointer for state buffer accesses */
  262. q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */
  263. uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
  264. uint32_t tapCnt, blkCnt; /* Loop counters */
  265. /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  266. /* pStateCurnt points to the location where the new input data should be written */
  267. pStateCurnt = &(S->pState[(numTaps - 1u)]);
  268. /* Apply loop unrolling and compute 4 output values simultaneously.
  269. * The variables acc0 ... acc3 hold output values that are being computed:
  270. *
  271. * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
  272. * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
  273. * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
  274. * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
  275. */
  276. blkCnt = blockSize >> 2;
  277. /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
  278. ** A second loop below computes the remaining 1 to 3 samples. */
  279. while(blkCnt > 0u)
  280. {
  281. /* Copy four new input samples into the state buffer,
  282. ** one 16-bit sample at a time (no 32-bit SIMD copy in this variant). */
  283. *pStateCurnt++ = *pSrc++;
  284. *pStateCurnt++ = *pSrc++;
  285. *pStateCurnt++ = *pSrc++;
  286. *pStateCurnt++ = *pSrc++;
  287. /* Set all accumulators to zero */
  288. acc0 = 0;
  289. acc1 = 0;
  290. acc2 = 0;
  291. acc3 = 0;
  292. /* Start state reads at the current window position (read as 32-bit words through __SIMD32 below) */
  293. px = pState;
  294. /* Start coefficient reads at the first coefficient (read as 32-bit words through __SIMD32 below) */
  295. pb = pCoeffs;
  296. /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
  297. x0 = *__SIMD32(px)++;
  298. /* Read the third and fourth samples from the state buffer: x[n-N-2], x[n-N-3] */
  299. x2 = *__SIMD32(px)++;
  300. /* Loop over the number of taps. Unroll by a factor of 4.
  301. ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
  302. tapCnt = numTaps >> 2;
  303. while(tapCnt > 0)
  304. {
  305. /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
  306. c0 = *__SIMD32(pb)++;
  307. /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
  308. acc0 = __SMLALD(x0, c0, acc0);
  309. /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
  310. acc2 = __SMLALD(x2, c0, acc2);
  311. /* pack x[n-N-1] and x[n-N-2] */
  312. #ifndef ARM_MATH_BIG_ENDIAN
  313. x1 = __PKHBT(x2, x0, 0);
  314. #else
  315. x1 = __PKHBT(x0, x2, 0);
  316. #endif
  317. /* Read state x[n-N-4], x[n-N-5] */
  318. x0 = _SIMD32_OFFSET(px);
  319. /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
  320. acc1 = __SMLALDX(x1, c0, acc1);
  321. /* pack x[n-N-3] and x[n-N-4] */
  322. #ifndef ARM_MATH_BIG_ENDIAN
  323. x1 = __PKHBT(x0, x2, 0);
  324. #else
  325. x1 = __PKHBT(x2, x0, 0);
  326. #endif
  327. /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
  328. acc3 = __SMLALDX(x1, c0, acc3);
  329. /* Read coefficients b[N-2], b[N-3] */
  330. c0 = *__SIMD32(pb)++;
  331. /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
  332. acc0 = __SMLALD(x2, c0, acc0);
  333. /* Read state x[n-N-6], x[n-N-7] with offset */
  334. x2 = _SIMD32_OFFSET(px + 2u);
  335. /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
  336. acc2 = __SMLALD(x0, c0, acc2);
  337. /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4]; reuses the x1 packed above */
  338. acc1 = __SMLALDX(x1, c0, acc1);
  339. /* pack x[n-N-5] and x[n-N-6] */
  340. #ifndef ARM_MATH_BIG_ENDIAN
  341. x1 = __PKHBT(x2, x0, 0);
  342. #else
  343. x1 = __PKHBT(x0, x2, 0);
  344. #endif
  345. /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
  346. acc3 = __SMLALDX(x1, c0, acc3);
  347. /* Update state pointer for next state reading */
  348. px += 4u;
  349. /* Decrement tap count */
  350. tapCnt--;
  351. }
  352. /* If the filter length is not a multiple of 4, compute the remaining filter taps.
  353. ** Any non-zero remainder is handled as exactly 2 taps (one coefficient pair),
  354. ** which is only correct when the filter length is even. */
  354. if((numTaps & 0x3u) != 0u)
  355. {
  356. /* Read last two coefficients */
  357. c0 = *__SIMD32(pb)++;
  358. /* Perform the multiply-accumulates; x0 and x2 still hold the words loaded before/inside the loop above */
  359. acc0 = __SMLALD(x0, c0, acc0);
  360. acc2 = __SMLALD(x2, c0, acc2);
  361. /* pack state variables */
  362. #ifndef ARM_MATH_BIG_ENDIAN
  363. x1 = __PKHBT(x2, x0, 0);
  364. #else
  365. x1 = __PKHBT(x0, x2, 0);
  366. #endif
  367. /* Read last state variables (px is not advanced) */
  368. x0 = *__SIMD32(px);
  369. /* Perform the multiply-accumulates */
  370. acc1 = __SMLALDX(x1, c0, acc1);
  371. /* pack state variables */
  372. #ifndef ARM_MATH_BIG_ENDIAN
  373. x1 = __PKHBT(x0, x2, 0);
  374. #else
  375. x1 = __PKHBT(x2, x0, 0);
  376. #endif
  377. /* Perform the multiply-accumulates */
  378. acc3 = __SMLALDX(x1, c0, acc3);
  379. }
  380. /* Each 64-bit accumulator is shifted right by 15 and saturated to 16 bits (1.15).
  381. ** The 4 outputs are then stored in the destination buffer, two per 32-bit write. */
  382. #ifndef ARM_MATH_BIG_ENDIAN
  383. *__SIMD32(pDst)++ =
  384. __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
  385. *__SIMD32(pDst)++ =
  386. __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
  387. #else
  388. *__SIMD32(pDst)++ =
  389. __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
  390. *__SIMD32(pDst)++ =
  391. __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
  392. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  393. /* Advance the state pointer by 4 to process the next group of 4 samples */
  394. pState = pState + 4;
  395. /* Decrement the loop counter */
  396. blkCnt--;
  397. }
  398. /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
  399. ** No loop unrolling is used. */
  400. blkCnt = blockSize % 0x4u;
  401. while(blkCnt > 0u)
  402. {
  403. /* Copy one sample into the state buffer */
  404. *pStateCurnt++ = *pSrc++;
  405. /* Set the accumulator to zero */
  406. acc0 = 0;
  407. /* Scalar q15 pointers into the state window and the coefficients (no SIMD in this loop) */
  408. px = pState;
  409. pb = pCoeffs;
  /* tapCnt counts pairs of taps. The do/while body runs at least once,
  ** so this path processes a minimum of 2 taps and ignores an odd final tap. */
  410. tapCnt = numTaps >> 1u;
  411. do
  412. {
  413. acc0 += (q31_t) * px++ * *pb++;
  414. acc0 += (q31_t) * px++ * *pb++;
  415. tapCnt--;
  416. }
  417. while(tapCnt > 0u);
  418. /* Shift the accumulator right by 15 and saturate to 16 bits (1.15).
  419. ** Then store the output in the destination buffer. */
  420. *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  421. /* Advance state pointer by 1 for the next sample */
  422. pState = pState + 1u;
  423. /* Decrement the loop counter */
  424. blkCnt--;
  425. }
  426. /* Processing is complete.
  427. ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  428. ** This prepares the state buffer for the next function call. */
  429. /* Points to the start of the state buffer */
  430. pStateCurnt = S->pState;
  431. /* Number of groups of four samples to copy; each group is copied one q15 at a time */
  432. tapCnt = (numTaps - 1u) >> 2;
  433. while(tapCnt > 0u)
  434. {
  435. *pStateCurnt++ = *pState++;
  436. *pStateCurnt++ = *pState++;
  437. *pStateCurnt++ = *pState++;
  438. *pStateCurnt++ = *pState++;
  439. tapCnt--;
  440. }
  441. /* Calculation of count for remaining q15_t data */
  442. tapCnt = (numTaps - 1u) % 0x4u;
  443. /* Copy the remaining 0 to 3 samples */
  444. while(tapCnt > 0u)
  445. {
  446. *pStateCurnt++ = *pState++;
  447. /* Decrement the loop counter */
  448. tapCnt--;
  449. }
  450. }
  451. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  452. #else /* ARM_MATH_CM0_FAMILY */
  453. /* Run the below code for Cortex-M0 */
  /* Q15 FIR variant compiled when ARM_MATH_CM0_FAMILY is defined.
  **
  ** S         filter instance; pState, pCoeffs and numTaps are read from it.
  ** pSrc      blockSize input samples.
  ** pDst      receives blockSize output samples.
  ** blockSize number of samples to process.
  **
  ** Scalar implementation: one output sample per outer iteration and one
  ** multiply-accumulate per tap, with no SIMD pointer accesses. New samples are
  ** appended at S->pState[numTaps - 1] (so the state buffer is written up to
  ** index numTaps - 1 + blockSize - 1), and the trailing numTaps - 1 state
  ** samples are moved back to the start of S->pState before returning. */
  454. void arm_fir_q15(
  455. const arm_fir_instance_q15 * S,
  456. q15_t * pSrc,
  457. q15_t * pDst,
  458. uint32_t blockSize)
  459. {
  460. q15_t *pState = S->pState; /* State pointer */
  461. q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  462. q15_t *pStateCurnt; /* Points to the current sample of the state */
  463. q15_t *px; /* Temporary pointer for state buffer */
  464. q15_t *pb; /* Temporary pointer for coefficient buffer */
  465. q63_t acc; /* Accumulator */
  466. uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
  467. uint32_t tapCnt, blkCnt; /* Loop counters */
  468. /* S->pState buffer contains previous frame (numTaps - 1) samples */
  469. /* pStateCurnt points to the location where the new input data should be written */
  470. pStateCurnt = &(S->pState[(numTaps - 1u)]);
  471. /* Initialize blkCnt with blockSize */
  472. blkCnt = blockSize;
  473. while(blkCnt > 0u)
  474. {
  475. /* Copy one sample at a time into state buffer */
  476. *pStateCurnt++ = *pSrc++;
  477. /* Set the accumulator to zero */
  478. acc = 0;
  479. /* Initialize state pointer */
  480. px = pState;
  481. /* Initialize Coefficient pointer */
  482. pb = pCoeffs;
  483. tapCnt = numTaps;
  484. /* Perform the multiply-accumulates. The do/while body runs before the count is
  ** tested, so numTaps == 0 would wrap tapCnt instead of skipping the loop. */
  485. do
  486. {
  487. /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
  488. acc += (q31_t) * px++ * *pb++;
  489. tapCnt--;
  490. } while(tapCnt > 0u);
  491. /* Shift the accumulator right by 15 and saturate to 16 bits (1.15).
  492. ** Then store the output in the destination buffer. */
  493. *pDst++ = (q15_t) __SSAT((acc >> 15u), 16);
  494. /* Advance state pointer by 1 for the next sample */
  495. pState = pState + 1;
  496. /* Decrement the samples loop counter */
  497. blkCnt--;
  498. }
  499. /* Processing is complete.
  500. ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  501. ** This prepares the state buffer for the next function call. */
  502. /* Points to the start of the state buffer */
  503. pStateCurnt = S->pState;
  504. /* Copy numTaps - 1 values */
  505. tapCnt = (numTaps - 1u);
  506. /* copy data */
  507. while(tapCnt > 0u)
  508. {
  509. *pStateCurnt++ = *pState++;
  510. /* Decrement the loop counter */
  511. tapCnt--;
  512. }
  513. }
  514. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  515. /**
  516. * @} end of FIR group
  517. */