Keyboard firmwares for Atmel AVR and Cortex-M
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

arm_fir_sparse_q31.c 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_fir_sparse_q31.c
  9. *
  10. * Description: Q31 sparse FIR filter processing function.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * ------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @addtogroup FIR_Sparse
  43. * @{
  44. */
  45. /**
  46. * @brief Processing function for the Q31 sparse FIR filter.
  47. * @param[in] *S points to an instance of the Q31 sparse FIR structure.
  48. * @param[in] *pSrc points to the block of input data.
  49. * @param[out] *pDst points to the block of output data
  50. * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
  51. * @param[in] blockSize number of input samples to process per call.
  52. * @return none.
  53. *
  54. * <b>Scaling and Overflow Behavior:</b>
  55. * \par
  56. * The function is implemented using an internal 32-bit accumulator.
  57. * The 1.31 x 1.31 multiplications are truncated to 2.30 format.
  58. * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
  59. * If the accumulator result overflows, it wraps around rather than saturate.
  60. * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
  61. */
  62. void arm_fir_sparse_q31(
  63. arm_fir_sparse_instance_q31 * S,
  64. q31_t * pSrc,
  65. q31_t * pDst,
  66. q31_t * pScratchIn,
  67. uint32_t blockSize)
  68. {
  69. q31_t *pState = S->pState; /* State pointer */
  70. q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  71. q31_t *px; /* Scratch buffer pointer */
  72. q31_t *py = pState; /* Temporary pointers for state buffer */
  73. q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
  74. q31_t *pOut; /* Destination pointer */
  75. q63_t out; /* Temporary output variable */
  76. int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
  77. uint32_t delaySize = S->maxDelay + blockSize; /* state length */
  78. uint16_t numTaps = S->numTaps; /* Filter order */
  79. int32_t readIndex; /* Read index of the state buffer */
  80. uint32_t tapCnt, blkCnt; /* loop counters */
  81. q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
  82. q31_t in;
  83. /* BlockSize of Input samples are copied into the state buffer */
  84. /* StateIndex points to the starting position to write in the state buffer */
  85. arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
  86. (int32_t *) pSrc, 1, blockSize);
  87. /* Read Index, from where the state buffer should be read, is calculated. */
  88. readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
  89. /* Wraparound of readIndex */
  90. if(readIndex < 0)
  91. {
  92. readIndex += (int32_t) delaySize;
  93. }
  94. /* Working pointer for state buffer is updated */
  95. py = pState;
  96. /* blockSize samples are read from the state buffer */
  97. arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
  98. (int32_t *) pb, (int32_t *) pb, blockSize, 1,
  99. blockSize);
  100. /* Working pointer for the scratch buffer of state values */
  101. px = pb;
  102. /* Working pointer for scratch buffer of output values */
  103. pOut = pDst;
  104. #ifndef ARM_MATH_CM0_FAMILY
  105. /* Run the below code for Cortex-M4 and Cortex-M3 */
  106. /* Loop over the blockSize. Unroll by a factor of 4.
  107. * Compute 4 Multiplications at a time. */
  108. blkCnt = blockSize >> 2;
  109. while(blkCnt > 0u)
  110. {
  111. /* Perform Multiplications and store in the destination buffer */
  112. *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
  113. *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
  114. *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
  115. *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
  116. /* Decrement the loop counter */
  117. blkCnt--;
  118. }
  119. /* If the blockSize is not a multiple of 4,
  120. * compute the remaining samples */
  121. blkCnt = blockSize % 0x4u;
  122. while(blkCnt > 0u)
  123. {
  124. /* Perform Multiplications and store in the destination buffer */
  125. *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
  126. /* Decrement the loop counter */
  127. blkCnt--;
  128. }
  129. /* Load the coefficient value and
  130. * increment the coefficient buffer for the next set of state values */
  131. coeff = *pCoeffs++;
  132. /* Read Index, from where the state buffer should be read, is calculated. */
  133. readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
  134. /* Wraparound of readIndex */
  135. if(readIndex < 0)
  136. {
  137. readIndex += (int32_t) delaySize;
  138. }
  139. /* Loop over the number of taps. */
  140. tapCnt = (uint32_t) numTaps - 1u;
  141. while(tapCnt > 0u)
  142. {
  143. /* Working pointer for state buffer is updated */
  144. py = pState;
  145. /* blockSize samples are read from the state buffer */
  146. arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
  147. (int32_t *) pb, (int32_t *) pb, blockSize, 1,
  148. blockSize);
  149. /* Working pointer for the scratch buffer of state values */
  150. px = pb;
  151. /* Working pointer for scratch buffer of output values */
  152. pOut = pDst;
  153. /* Loop over the blockSize. Unroll by a factor of 4.
  154. * Compute 4 MACS at a time. */
  155. blkCnt = blockSize >> 2;
  156. while(blkCnt > 0u)
  157. {
  158. out = *pOut;
  159. out += ((q63_t) * px++ * coeff) >> 32;
  160. *pOut++ = (q31_t) (out);
  161. out = *pOut;
  162. out += ((q63_t) * px++ * coeff) >> 32;
  163. *pOut++ = (q31_t) (out);
  164. out = *pOut;
  165. out += ((q63_t) * px++ * coeff) >> 32;
  166. *pOut++ = (q31_t) (out);
  167. out = *pOut;
  168. out += ((q63_t) * px++ * coeff) >> 32;
  169. *pOut++ = (q31_t) (out);
  170. /* Decrement the loop counter */
  171. blkCnt--;
  172. }
  173. /* If the blockSize is not a multiple of 4,
  174. * compute the remaining samples */
  175. blkCnt = blockSize % 0x4u;
  176. while(blkCnt > 0u)
  177. {
  178. /* Perform Multiply-Accumulate */
  179. out = *pOut;
  180. out += ((q63_t) * px++ * coeff) >> 32;
  181. *pOut++ = (q31_t) (out);
  182. /* Decrement the loop counter */
  183. blkCnt--;
  184. }
  185. /* Load the coefficient value and
  186. * increment the coefficient buffer for the next set of state values */
  187. coeff = *pCoeffs++;
  188. /* Read Index, from where the state buffer should be read, is calculated. */
  189. readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
  190. /* Wraparound of readIndex */
  191. if(readIndex < 0)
  192. {
  193. readIndex += (int32_t) delaySize;
  194. }
  195. /* Decrement the tap loop counter */
  196. tapCnt--;
  197. }
  198. /* Working output pointer is updated */
  199. pOut = pDst;
  200. /* Output is converted into 1.31 format. */
  201. /* Loop over the blockSize. Unroll by a factor of 4.
  202. * process 4 output samples at a time. */
  203. blkCnt = blockSize >> 2;
  204. while(blkCnt > 0u)
  205. {
  206. in = *pOut << 1;
  207. *pOut++ = in;
  208. in = *pOut << 1;
  209. *pOut++ = in;
  210. in = *pOut << 1;
  211. *pOut++ = in;
  212. in = *pOut << 1;
  213. *pOut++ = in;
  214. /* Decrement the loop counter */
  215. blkCnt--;
  216. }
  217. /* If the blockSize is not a multiple of 4,
  218. * process the remaining output samples */
  219. blkCnt = blockSize % 0x4u;
  220. while(blkCnt > 0u)
  221. {
  222. in = *pOut << 1;
  223. *pOut++ = in;
  224. /* Decrement the loop counter */
  225. blkCnt--;
  226. }
  227. #else
  228. /* Run the below code for Cortex-M0 */
  229. blkCnt = blockSize;
  230. while(blkCnt > 0u)
  231. {
  232. /* Perform Multiplications and store in the destination buffer */
  233. *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
  234. /* Decrement the loop counter */
  235. blkCnt--;
  236. }
  237. /* Load the coefficient value and
  238. * increment the coefficient buffer for the next set of state values */
  239. coeff = *pCoeffs++;
  240. /* Read Index, from where the state buffer should be read, is calculated. */
  241. readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
  242. /* Wraparound of readIndex */
  243. if(readIndex < 0)
  244. {
  245. readIndex += (int32_t) delaySize;
  246. }
  247. /* Loop over the number of taps. */
  248. tapCnt = (uint32_t) numTaps - 1u;
  249. while(tapCnt > 0u)
  250. {
  251. /* Working pointer for state buffer is updated */
  252. py = pState;
  253. /* blockSize samples are read from the state buffer */
  254. arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
  255. (int32_t *) pb, (int32_t *) pb, blockSize, 1,
  256. blockSize);
  257. /* Working pointer for the scratch buffer of state values */
  258. px = pb;
  259. /* Working pointer for scratch buffer of output values */
  260. pOut = pDst;
  261. blkCnt = blockSize;
  262. while(blkCnt > 0u)
  263. {
  264. /* Perform Multiply-Accumulate */
  265. out = *pOut;
  266. out += ((q63_t) * px++ * coeff) >> 32;
  267. *pOut++ = (q31_t) (out);
  268. /* Decrement the loop counter */
  269. blkCnt--;
  270. }
  271. /* Load the coefficient value and
  272. * increment the coefficient buffer for the next set of state values */
  273. coeff = *pCoeffs++;
  274. /* Read Index, from where the state buffer should be read, is calculated. */
  275. readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
  276. /* Wraparound of readIndex */
  277. if(readIndex < 0)
  278. {
  279. readIndex += (int32_t) delaySize;
  280. }
  281. /* Decrement the tap loop counter */
  282. tapCnt--;
  283. }
  284. /* Working output pointer is updated */
  285. pOut = pDst;
  286. /* Output is converted into 1.31 format. */
  287. blkCnt = blockSize;
  288. while(blkCnt > 0u)
  289. {
  290. in = *pOut << 1;
  291. *pOut++ = in;
  292. /* Decrement the loop counter */
  293. blkCnt--;
  294. }
  295. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  296. }
  297. /**
  298. * @} end of FIR_Sparse group
  299. */