Keyboard firmwares for Atmel AVR and Cortex-M
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

arm_fir_decimate_fast_q15.c 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_fir_decimate_fast_q15.c
  9. *
  10. * Description: Fast Q15 FIR Decimator.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup FIR_decimate
  46. * @{
  47. */
  48. /**
  49. * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
  50. * @param[in] *S points to an instance of the Q15 FIR decimator structure.
  51. * @param[in] *pSrc points to the block of input data.
  52. * @param[out] *pDst points to the block of output data
  53. * @param[in] blockSize number of input samples to process per call.
  54. * @return none
  55. *
  56. * \par Restrictions
  57. * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  58. * In this case the input, output, and state buffers should be aligned to 32 bits.
  59. *
  60. * <b>Scaling and Overflow Behavior:</b>
  61. * \par
  62. * This fast version uses a 32-bit accumulator with 2.30 format.
  63. * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  64. * Thus, if the accumulator result overflows it wraps around and distorts the result.
  65. * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
  66. * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
  67. *
  68. * \par
  69. * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
  70. * Both the slow and the fast versions use the same instance structure.
  71. * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure.
  72. */
  73. #ifndef UNALIGNED_SUPPORT_DISABLE
  74. void arm_fir_decimate_fast_q15(
  75. const arm_fir_decimate_instance_q15 * S,
  76. q15_t * pSrc,
  77. q15_t * pDst,
  78. uint32_t blockSize)
  79. {
  80. q15_t *pState = S->pState; /* State pointer */
  81. q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  82. q15_t *pStateCurnt; /* Points to the current sample of the state */
  83. q15_t *px; /* Temporary pointer for state buffer */
  84. q15_t *pb; /* Temporary pointer coefficient buffer */
  85. q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */
  86. q31_t sum0; /* Accumulators */
  87. q31_t acc0, acc1;
  88. q15_t *px0, *px1;
  89. uint32_t blkCntN3;
  90. uint32_t numTaps = S->numTaps; /* Number of taps */
  91. uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
  92. /* S->pState buffer contains previous frame (numTaps - 1) samples */
  93. /* pStateCurnt points to the location where the new input data should be written */
  94. pStateCurnt = S->pState + (numTaps - 1u);
  95. /* Total number of output samples to be computed */
  96. blkCnt = outBlockSize / 2;
  97. blkCntN3 = outBlockSize - (2 * blkCnt);
  98. while(blkCnt > 0u)
  99. {
  100. /* Copy decimation factor number of new input samples into the state buffer */
  101. i = 2 * S->M;
  102. do
  103. {
  104. *pStateCurnt++ = *pSrc++;
  105. } while(--i);
  106. /* Set accumulator to zero */
  107. acc0 = 0;
  108. acc1 = 0;
  109. /* Initialize state pointer */
  110. px0 = pState;
  111. px1 = pState + S->M;
  112. /* Initialize coeff pointer */
  113. pb = pCoeffs;
  114. /* Loop unrolling. Process 4 taps at a time. */
  115. tapCnt = numTaps >> 2;
  116. /* Loop over the number of taps. Unroll by a factor of 4.
  117. ** Repeat until we've computed numTaps-4 coefficients. */
  118. while(tapCnt > 0u)
  119. {
  120. /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
  121. c0 = *__SIMD32(pb)++;
  122. /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
  123. x0 = *__SIMD32(px0)++;
  124. x1 = *__SIMD32(px1)++;
  125. /* Perform the multiply-accumulate */
  126. acc0 = __SMLAD(x0, c0, acc0);
  127. acc1 = __SMLAD(x1, c0, acc1);
  128. /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
  129. c0 = *__SIMD32(pb)++;
  130. /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
  131. x0 = *__SIMD32(px0)++;
  132. x1 = *__SIMD32(px1)++;
  133. /* Perform the multiply-accumulate */
  134. acc0 = __SMLAD(x0, c0, acc0);
  135. acc1 = __SMLAD(x1, c0, acc1);
  136. /* Decrement the loop counter */
  137. tapCnt--;
  138. }
  139. /* If the filter length is not a multiple of 4, compute the remaining filter taps */
  140. tapCnt = numTaps % 0x4u;
  141. while(tapCnt > 0u)
  142. {
  143. /* Read coefficients */
  144. c0 = *pb++;
  145. /* Fetch 1 state variable */
  146. x0 = *px0++;
  147. x1 = *px1++;
  148. /* Perform the multiply-accumulate */
  149. acc0 = __SMLAD(x0, c0, acc0);
  150. acc1 = __SMLAD(x1, c0, acc1);
  151. /* Decrement the loop counter */
  152. tapCnt--;
  153. }
  154. /* Advance the state pointer by the decimation factor
  155. * to process the next group of decimation factor number samples */
  156. pState = pState + S->M * 2;
  157. /* Store filter output, smlad returns the values in 2.14 format */
  158. /* so downsacle by 15 to get output in 1.15 */
  159. *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  160. *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
  161. /* Decrement the loop counter */
  162. blkCnt--;
  163. }
  164. while(blkCntN3 > 0u)
  165. {
  166. /* Copy decimation factor number of new input samples into the state buffer */
  167. i = S->M;
  168. do
  169. {
  170. *pStateCurnt++ = *pSrc++;
  171. } while(--i);
  172. /*Set sum to zero */
  173. sum0 = 0;
  174. /* Initialize state pointer */
  175. px = pState;
  176. /* Initialize coeff pointer */
  177. pb = pCoeffs;
  178. /* Loop unrolling. Process 4 taps at a time. */
  179. tapCnt = numTaps >> 2;
  180. /* Loop over the number of taps. Unroll by a factor of 4.
  181. ** Repeat until we've computed numTaps-4 coefficients. */
  182. while(tapCnt > 0u)
  183. {
  184. /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
  185. c0 = *__SIMD32(pb)++;
  186. /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
  187. x0 = *__SIMD32(px)++;
  188. /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
  189. c1 = *__SIMD32(pb)++;
  190. /* Perform the multiply-accumulate */
  191. sum0 = __SMLAD(x0, c0, sum0);
  192. /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
  193. x0 = *__SIMD32(px)++;
  194. /* Perform the multiply-accumulate */
  195. sum0 = __SMLAD(x0, c1, sum0);
  196. /* Decrement the loop counter */
  197. tapCnt--;
  198. }
  199. /* If the filter length is not a multiple of 4, compute the remaining filter taps */
  200. tapCnt = numTaps % 0x4u;
  201. while(tapCnt > 0u)
  202. {
  203. /* Read coefficients */
  204. c0 = *pb++;
  205. /* Fetch 1 state variable */
  206. x0 = *px++;
  207. /* Perform the multiply-accumulate */
  208. sum0 = __SMLAD(x0, c0, sum0);
  209. /* Decrement the loop counter */
  210. tapCnt--;
  211. }
  212. /* Advance the state pointer by the decimation factor
  213. * to process the next group of decimation factor number samples */
  214. pState = pState + S->M;
  215. /* Store filter output, smlad returns the values in 2.14 format */
  216. /* so downsacle by 15 to get output in 1.15 */
  217. *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
  218. /* Decrement the loop counter */
  219. blkCntN3--;
  220. }
  221. /* Processing is complete.
  222. ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
  223. ** This prepares the state buffer for the next function call. */
  224. /* Points to the start of the state buffer */
  225. pStateCurnt = S->pState;
  226. i = (numTaps - 1u) >> 2u;
  227. /* copy data */
  228. while(i > 0u)
  229. {
  230. *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
  231. *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
  232. /* Decrement the loop counter */
  233. i--;
  234. }
  235. i = (numTaps - 1u) % 0x04u;
  236. /* copy data */
  237. while(i > 0u)
  238. {
  239. *pStateCurnt++ = *pState++;
  240. /* Decrement the loop counter */
  241. i--;
  242. }
  243. }
  244. #else
  245. void arm_fir_decimate_fast_q15(
  246. const arm_fir_decimate_instance_q15 * S,
  247. q15_t * pSrc,
  248. q15_t * pDst,
  249. uint32_t blockSize)
  250. {
  251. q15_t *pState = S->pState; /* State pointer */
  252. q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  253. q15_t *pStateCurnt; /* Points to the current sample of the state */
  254. q15_t *px; /* Temporary pointer for state buffer */
  255. q15_t *pb; /* Temporary pointer coefficient buffer */
  256. q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
  257. q31_t sum0; /* Accumulators */
  258. q31_t acc0, acc1;
  259. q15_t *px0, *px1;
  260. uint32_t blkCntN3;
  261. uint32_t numTaps = S->numTaps; /* Number of taps */
  262. uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
  263. /* S->pState buffer contains previous frame (numTaps - 1) samples */
  264. /* pStateCurnt points to the location where the new input data should be written */
  265. pStateCurnt = S->pState + (numTaps - 1u);
  266. /* Total number of output samples to be computed */
  267. blkCnt = outBlockSize / 2;
  268. blkCntN3 = outBlockSize - (2 * blkCnt);
  269. while(blkCnt > 0u)
  270. {
  271. /* Copy decimation factor number of new input samples into the state buffer */
  272. i = 2 * S->M;
  273. do
  274. {
  275. *pStateCurnt++ = *pSrc++;
  276. } while(--i);
  277. /* Set accumulator to zero */
  278. acc0 = 0;
  279. acc1 = 0;
  280. /* Initialize state pointer */
  281. px0 = pState;
  282. px1 = pState + S->M;
  283. /* Initialize coeff pointer */
  284. pb = pCoeffs;
  285. /* Loop unrolling. Process 4 taps at a time. */
  286. tapCnt = numTaps >> 2;
  287. /* Loop over the number of taps. Unroll by a factor of 4.
  288. ** Repeat until we've computed numTaps-4 coefficients. */
  289. while(tapCnt > 0u)
  290. {
  291. /* Read the Read b[numTaps-1] coefficients */
  292. c0 = *pb++;
  293. /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
  294. x0 = *px0++;
  295. x1 = *px1++;
  296. /* Perform the multiply-accumulate */
  297. acc0 += x0 * c0;
  298. acc1 += x1 * c0;
  299. /* Read the b[numTaps-2] coefficient */
  300. c0 = *pb++;
  301. /* Read x[n-numTaps-2] for sample 0 and sample 1 */
  302. x0 = *px0++;
  303. x1 = *px1++;
  304. /* Perform the multiply-accumulate */
  305. acc0 += x0 * c0;
  306. acc1 += x1 * c0;
  307. /* Read the b[numTaps-3] coefficients */
  308. c0 = *pb++;
  309. /* Read x[n-numTaps-3] for sample 0 and sample 1 */
  310. x0 = *px0++;
  311. x1 = *px1++;
  312. /* Perform the multiply-accumulate */
  313. acc0 += x0 * c0;
  314. acc1 += x1 * c0;
  315. /* Read the b[numTaps-4] coefficient */
  316. c0 = *pb++;
  317. /* Read x[n-numTaps-4] for sample 0 and sample 1 */
  318. x0 = *px0++;
  319. x1 = *px1++;
  320. /* Perform the multiply-accumulate */
  321. acc0 += x0 * c0;
  322. acc1 += x1 * c0;
  323. /* Decrement the loop counter */
  324. tapCnt--;
  325. }
  326. /* If the filter length is not a multiple of 4, compute the remaining filter taps */
  327. tapCnt = numTaps % 0x4u;
  328. while(tapCnt > 0u)
  329. {
  330. /* Read coefficients */
  331. c0 = *pb++;
  332. /* Fetch 1 state variable */
  333. x0 = *px0++;
  334. x1 = *px1++;
  335. /* Perform the multiply-accumulate */
  336. acc0 += x0 * c0;
  337. acc1 += x1 * c0;
  338. /* Decrement the loop counter */
  339. tapCnt--;
  340. }
  341. /* Advance the state pointer by the decimation factor
  342. * to process the next group of decimation factor number samples */
  343. pState = pState + S->M * 2;
  344. /* Store filter output, smlad returns the values in 2.14 format */
  345. /* so downsacle by 15 to get output in 1.15 */
  346. *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  347. *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
  348. /* Decrement the loop counter */
  349. blkCnt--;
  350. }
  351. while(blkCntN3 > 0u)
  352. {
  353. /* Copy decimation factor number of new input samples into the state buffer */
  354. i = S->M;
  355. do
  356. {
  357. *pStateCurnt++ = *pSrc++;
  358. } while(--i);
  359. /*Set sum to zero */
  360. sum0 = 0;
  361. /* Initialize state pointer */
  362. px = pState;
  363. /* Initialize coeff pointer */
  364. pb = pCoeffs;
  365. /* Loop unrolling. Process 4 taps at a time. */
  366. tapCnt = numTaps >> 2;
  367. /* Loop over the number of taps. Unroll by a factor of 4.
  368. ** Repeat until we've computed numTaps-4 coefficients. */
  369. while(tapCnt > 0u)
  370. {
  371. /* Read the Read b[numTaps-1] coefficients */
  372. c0 = *pb++;
  373. /* Read x[n-numTaps-1] and sample */
  374. x0 = *px++;
  375. /* Perform the multiply-accumulate */
  376. sum0 += x0 * c0;
  377. /* Read the b[numTaps-2] coefficient */
  378. c0 = *pb++;
  379. /* Read x[n-numTaps-2] and sample */
  380. x0 = *px++;
  381. /* Perform the multiply-accumulate */
  382. sum0 += x0 * c0;
  383. /* Read the b[numTaps-3] coefficients */
  384. c0 = *pb++;
  385. /* Read x[n-numTaps-3] sample */
  386. x0 = *px++;
  387. /* Perform the multiply-accumulate */
  388. sum0 += x0 * c0;
  389. /* Read the b[numTaps-4] coefficient */
  390. c0 = *pb++;
  391. /* Read x[n-numTaps-4] sample */
  392. x0 = *px++;
  393. /* Perform the multiply-accumulate */
  394. sum0 += x0 * c0;
  395. /* Decrement the loop counter */
  396. tapCnt--;
  397. }
  398. /* If the filter length is not a multiple of 4, compute the remaining filter taps */
  399. tapCnt = numTaps % 0x4u;
  400. while(tapCnt > 0u)
  401. {
  402. /* Read coefficients */
  403. c0 = *pb++;
  404. /* Fetch 1 state variable */
  405. x0 = *px++;
  406. /* Perform the multiply-accumulate */
  407. sum0 += x0 * c0;
  408. /* Decrement the loop counter */
  409. tapCnt--;
  410. }
  411. /* Advance the state pointer by the decimation factor
  412. * to process the next group of decimation factor number samples */
  413. pState = pState + S->M;
  414. /* Store filter output, smlad returns the values in 2.14 format */
  415. /* so downsacle by 15 to get output in 1.15 */
  416. *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
  417. /* Decrement the loop counter */
  418. blkCntN3--;
  419. }
  420. /* Processing is complete.
  421. ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
  422. ** This prepares the state buffer for the next function call. */
  423. /* Points to the start of the state buffer */
  424. pStateCurnt = S->pState;
  425. i = (numTaps - 1u) >> 2u;
  426. /* copy data */
  427. while(i > 0u)
  428. {
  429. *pStateCurnt++ = *pState++;
  430. *pStateCurnt++ = *pState++;
  431. *pStateCurnt++ = *pState++;
  432. *pStateCurnt++ = *pState++;
  433. /* Decrement the loop counter */
  434. i--;
  435. }
  436. i = (numTaps - 1u) % 0x04u;
  437. /* copy data */
  438. while(i > 0u)
  439. {
  440. *pStateCurnt++ = *pState++;
  441. /* Decrement the loop counter */
  442. i--;
  443. }
  444. }
  445. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  446. /**
  447. * @} end of FIR_decimate group
  448. */