Keyboard firmwares for Atmel AVR and Cortex-M
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

arm_fir_sparse_q15.c 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_fir_sparse_q15.c
  9. *
  10. * Description: Q15 sparse FIR filter processing function.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * ------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @addtogroup FIR_Sparse
  43. * @{
  44. */
  45. /**
  46. * @brief Processing function for the Q15 sparse FIR filter.
  47. * @param[in] *S points to an instance of the Q15 sparse FIR structure.
  48. * @param[in] *pSrc points to the block of input data.
  49. * @param[out] *pDst points to the block of output data
  50. * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
  51. * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
  52. * @param[in] blockSize number of input samples to process per call.
  53. * @return none.
  54. *
  55. * <b>Scaling and Overflow Behavior:</b>
  56. * \par
  57. * The function is implemented using an internal 32-bit accumulator.
  58. * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
  59. * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
  60. * If the accumulator result overflows it will wrap around rather than saturate.
  61. * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
  62. * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
  63. */
  64. void arm_fir_sparse_q15(
  65. arm_fir_sparse_instance_q15 * S,
  66. q15_t * pSrc,
  67. q15_t * pDst,
  68. q15_t * pScratchIn,
  69. q31_t * pScratchOut,
  70. uint32_t blockSize)
  71. {
  72. q15_t *pState = S->pState; /* State pointer */
  73. q15_t *pIn = pSrc; /* Working pointer for input */
  74. q15_t *pOut = pDst; /* Working pointer for output */
  75. q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  76. q15_t *px; /* Temporary pointers for scratch buffer */
  77. q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
  78. q15_t *py = pState; /* Temporary pointers for state buffer */
  79. int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
  80. uint32_t delaySize = S->maxDelay + blockSize; /* state length */
  81. uint16_t numTaps = S->numTaps; /* Filter order */
  82. int32_t readIndex; /* Read index of the state buffer */
  83. uint32_t tapCnt, blkCnt; /* loop counters */
  84. q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
  85. q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */
  86. #ifndef ARM_MATH_CM0_FAMILY
  87. /* Run the below code for Cortex-M4 and Cortex-M3 */
  88. q31_t in1, in2; /* Temporary variables */
  89. /* BlockSize of Input samples are copied into the state buffer */
  90. /* StateIndex points to the starting position to write in the state buffer */
  91. arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
  92. /* Loop over the number of taps. */
  93. tapCnt = numTaps;
  94. /* Read Index, from where the state buffer should be read, is calculated. */
  95. readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
  96. /* Wraparound of readIndex */
  97. if(readIndex < 0)
  98. {
  99. readIndex += (int32_t) delaySize;
  100. }
  101. /* Working pointer for state buffer is updated */
  102. py = pState;
  103. /* blockSize samples are read from the state buffer */
  104. arm_circularRead_q15(py, delaySize, &readIndex, 1,
  105. pb, pb, blockSize, 1, blockSize);
  106. /* Working pointer for the scratch buffer of state values */
  107. px = pb;
  108. /* Working pointer for scratch buffer of output values */
  109. pScratchOut = pScr2;
  110. /* Loop over the blockSize. Unroll by a factor of 4.
  111. * Compute 4 multiplications at a time. */
  112. blkCnt = blockSize >> 2;
  113. while(blkCnt > 0u)
  114. {
  115. /* Perform multiplication and store in the scratch buffer */
  116. *pScratchOut++ = ((q31_t) * px++ * coeff);
  117. *pScratchOut++ = ((q31_t) * px++ * coeff);
  118. *pScratchOut++ = ((q31_t) * px++ * coeff);
  119. *pScratchOut++ = ((q31_t) * px++ * coeff);
  120. /* Decrement the loop counter */
  121. blkCnt--;
  122. }
  123. /* If the blockSize is not a multiple of 4,
  124. * compute the remaining samples */
  125. blkCnt = blockSize % 0x4u;
  126. while(blkCnt > 0u)
  127. {
  128. /* Perform multiplication and store in the scratch buffer */
  129. *pScratchOut++ = ((q31_t) * px++ * coeff);
  130. /* Decrement the loop counter */
  131. blkCnt--;
  132. }
  133. /* Load the coefficient value and
  134. * increment the coefficient buffer for the next set of state values */
  135. coeff = *pCoeffs++;
  136. /* Read Index, from where the state buffer should be read, is calculated. */
  137. readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
  138. /* Wraparound of readIndex */
  139. if(readIndex < 0)
  140. {
  141. readIndex += (int32_t) delaySize;
  142. }
  143. /* Loop over the number of taps. */
  144. tapCnt = (uint32_t) numTaps - 1u;
  145. while(tapCnt > 0u)
  146. {
  147. /* Working pointer for state buffer is updated */
  148. py = pState;
  149. /* blockSize samples are read from the state buffer */
  150. arm_circularRead_q15(py, delaySize, &readIndex, 1,
  151. pb, pb, blockSize, 1, blockSize);
  152. /* Working pointer for the scratch buffer of state values */
  153. px = pb;
  154. /* Working pointer for scratch buffer of output values */
  155. pScratchOut = pScr2;
  156. /* Loop over the blockSize. Unroll by a factor of 4.
  157. * Compute 4 MACS at a time. */
  158. blkCnt = blockSize >> 2;
  159. while(blkCnt > 0u)
  160. {
  161. /* Perform Multiply-Accumulate */
  162. *pScratchOut++ += (q31_t) * px++ * coeff;
  163. *pScratchOut++ += (q31_t) * px++ * coeff;
  164. *pScratchOut++ += (q31_t) * px++ * coeff;
  165. *pScratchOut++ += (q31_t) * px++ * coeff;
  166. /* Decrement the loop counter */
  167. blkCnt--;
  168. }
  169. /* If the blockSize is not a multiple of 4,
  170. * compute the remaining samples */
  171. blkCnt = blockSize % 0x4u;
  172. while(blkCnt > 0u)
  173. {
  174. /* Perform Multiply-Accumulate */
  175. *pScratchOut++ += (q31_t) * px++ * coeff;
  176. /* Decrement the loop counter */
  177. blkCnt--;
  178. }
  179. /* Load the coefficient value and
  180. * increment the coefficient buffer for the next set of state values */
  181. coeff = *pCoeffs++;
  182. /* Read Index, from where the state buffer should be read, is calculated. */
  183. readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
  184. /* Wraparound of readIndex */
  185. if(readIndex < 0)
  186. {
  187. readIndex += (int32_t) delaySize;
  188. }
  189. /* Decrement the tap loop counter */
  190. tapCnt--;
  191. }
  192. /* All the output values are in pScratchOut buffer.
  193. Convert them into 1.15 format, saturate and store in the destination buffer. */
  194. /* Loop over the blockSize. */
  195. blkCnt = blockSize >> 2;
  196. while(blkCnt > 0u)
  197. {
  198. in1 = *pScr2++;
  199. in2 = *pScr2++;
  200. #ifndef ARM_MATH_BIG_ENDIAN
  201. *__SIMD32(pOut)++ =
  202. __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
  203. 16);
  204. #else
  205. *__SIMD32(pOut)++ =
  206. __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
  207. 16);
  208. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  209. in1 = *pScr2++;
  210. in2 = *pScr2++;
  211. #ifndef ARM_MATH_BIG_ENDIAN
  212. *__SIMD32(pOut)++ =
  213. __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
  214. 16);
  215. #else
  216. *__SIMD32(pOut)++ =
  217. __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
  218. 16);
  219. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  220. blkCnt--;
  221. }
  222. /* If the blockSize is not a multiple of 4,
  223. remaining samples are processed in the below loop */
  224. blkCnt = blockSize % 0x4u;
  225. while(blkCnt > 0u)
  226. {
  227. *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
  228. blkCnt--;
  229. }
  230. #else
  231. /* Run the below code for Cortex-M0 */
  232. /* BlockSize of Input samples are copied into the state buffer */
  233. /* StateIndex points to the starting position to write in the state buffer */
  234. arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
  235. /* Loop over the number of taps. */
  236. tapCnt = numTaps;
  237. /* Read Index, from where the state buffer should be read, is calculated. */
  238. readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
  239. /* Wraparound of readIndex */
  240. if(readIndex < 0)
  241. {
  242. readIndex += (int32_t) delaySize;
  243. }
  244. /* Working pointer for state buffer is updated */
  245. py = pState;
  246. /* blockSize samples are read from the state buffer */
  247. arm_circularRead_q15(py, delaySize, &readIndex, 1,
  248. pb, pb, blockSize, 1, blockSize);
  249. /* Working pointer for the scratch buffer of state values */
  250. px = pb;
  251. /* Working pointer for scratch buffer of output values */
  252. pScratchOut = pScr2;
  253. blkCnt = blockSize;
  254. while(blkCnt > 0u)
  255. {
  256. /* Perform multiplication and store in the scratch buffer */
  257. *pScratchOut++ = ((q31_t) * px++ * coeff);
  258. /* Decrement the loop counter */
  259. blkCnt--;
  260. }
  261. /* Load the coefficient value and
  262. * increment the coefficient buffer for the next set of state values */
  263. coeff = *pCoeffs++;
  264. /* Read Index, from where the state buffer should be read, is calculated. */
  265. readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
  266. /* Wraparound of readIndex */
  267. if(readIndex < 0)
  268. {
  269. readIndex += (int32_t) delaySize;
  270. }
  271. /* Loop over the number of taps. */
  272. tapCnt = (uint32_t) numTaps - 1u;
  273. while(tapCnt > 0u)
  274. {
  275. /* Working pointer for state buffer is updated */
  276. py = pState;
  277. /* blockSize samples are read from the state buffer */
  278. arm_circularRead_q15(py, delaySize, &readIndex, 1,
  279. pb, pb, blockSize, 1, blockSize);
  280. /* Working pointer for the scratch buffer of state values */
  281. px = pb;
  282. /* Working pointer for scratch buffer of output values */
  283. pScratchOut = pScr2;
  284. blkCnt = blockSize;
  285. while(blkCnt > 0u)
  286. {
  287. /* Perform Multiply-Accumulate */
  288. *pScratchOut++ += (q31_t) * px++ * coeff;
  289. /* Decrement the loop counter */
  290. blkCnt--;
  291. }
  292. /* Load the coefficient value and
  293. * increment the coefficient buffer for the next set of state values */
  294. coeff = *pCoeffs++;
  295. /* Read Index, from where the state buffer should be read, is calculated. */
  296. readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
  297. /* Wraparound of readIndex */
  298. if(readIndex < 0)
  299. {
  300. readIndex += (int32_t) delaySize;
  301. }
  302. /* Decrement the tap loop counter */
  303. tapCnt--;
  304. }
  305. /* All the output values are in pScratchOut buffer.
  306. Convert them into 1.15 format, saturate and store in the destination buffer. */
  307. /* Loop over the blockSize. */
  308. blkCnt = blockSize;
  309. while(blkCnt > 0u)
  310. {
  311. *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
  312. blkCnt--;
  313. }
  314. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  315. }
  316. /**
  317. * @} end of FIR_Sparse group
  318. */