Keyboard firmwares for Atmel AVR and Cortex-M
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

arm_fir_sparse_q7.c 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. /* ----------------------------------------------------------------------
  2. * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
  3. *
  4. * $Date: 17. January 2013
  5. * $Revision: V1.4.1
  6. *
  7. * Project: CMSIS DSP Library
  8. * Title: arm_fir_sparse_q7.c
  9. *
  10. * Description: Q7 sparse FIR filter processing function.
  11. *
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in
  21. * the documentation and/or other materials provided with the
  22. * distribution.
  23. * - Neither the name of ARM LIMITED nor the names of its contributors
  24. * may be used to endorse or promote products derived from this
  25. * software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.
  39. * ------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41. /**
  42. * @ingroup groupFilters
  43. */
  44. /**
  45. * @addtogroup FIR_Sparse
  46. * @{
  47. */
  48. /**
  49. * @brief Processing function for the Q7 sparse FIR filter.
  50. * @param[in] *S points to an instance of the Q7 sparse FIR structure.
  51. * @param[in] *pSrc points to the block of input data.
  52. * @param[out] *pDst points to the block of output data
  53. * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
  54. * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
  55. * @param[in] blockSize number of input samples to process per call.
  56. * @return none.
  57. *
  58. * <b>Scaling and Overflow Behavior:</b>
  59. * \par
  60. * The function is implemented using a 32-bit internal accumulator.
  61. * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
  62. * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  63. * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
  64. * The accumulator is then converted to 18.7 format by discarding the low 7 bits.
  65. * Finally, the result is truncated to 1.7 format.
  66. */
  67. void arm_fir_sparse_q7(
  68. arm_fir_sparse_instance_q7 * S,
  69. q7_t * pSrc,
  70. q7_t * pDst,
  71. q7_t * pScratchIn,
  72. q31_t * pScratchOut,
  73. uint32_t blockSize)
  74. {
  75. q7_t *pState = S->pState; /* State pointer */
  76. q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  77. q7_t *px; /* Scratch buffer pointer */
  78. q7_t *py = pState; /* Temporary pointers for state buffer */
  79. q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
  80. q7_t *pOut = pDst; /* Destination pointer */
  81. int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
  82. uint32_t delaySize = S->maxDelay + blockSize; /* state length */
  83. uint16_t numTaps = S->numTaps; /* Filter order */
  84. int32_t readIndex; /* Read index of the state buffer */
  85. uint32_t tapCnt, blkCnt; /* loop counters */
  86. q7_t coeff = *pCoeffs++; /* Read the coefficient value */
  87. q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
  88. q31_t in;
  89. #ifndef ARM_MATH_CM0_FAMILY
  90. /* Run the below code for Cortex-M4 and Cortex-M3 */
  91. q7_t in1, in2, in3, in4;
  92. /* BlockSize of Input samples are copied into the state buffer */
  93. /* StateIndex points to the starting position to write in the state buffer */
  94. arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
  95. blockSize);
  96. /* Loop over the number of taps. */
  97. tapCnt = numTaps;
  98. /* Read Index, from where the state buffer should be read, is calculated. */
  99. readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
  100. /* Wraparound of readIndex */
  101. if(readIndex < 0)
  102. {
  103. readIndex += (int32_t) delaySize;
  104. }
  105. /* Working pointer for state buffer is updated */
  106. py = pState;
  107. /* blockSize samples are read from the state buffer */
  108. arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
  109. (int32_t) blockSize, 1, blockSize);
  110. /* Working pointer for the scratch buffer of state values */
  111. px = pb;
  112. /* Working pointer for scratch buffer of output values */
  113. pScratchOut = pScr2;
  114. /* Loop over the blockSize. Unroll by a factor of 4.
  115. * Compute 4 multiplications at a time. */
  116. blkCnt = blockSize >> 2;
  117. while(blkCnt > 0u)
  118. {
  119. /* Perform multiplication and store in the scratch buffer */
  120. *pScratchOut++ = ((q31_t) * px++ * coeff);
  121. *pScratchOut++ = ((q31_t) * px++ * coeff);
  122. *pScratchOut++ = ((q31_t) * px++ * coeff);
  123. *pScratchOut++ = ((q31_t) * px++ * coeff);
  124. /* Decrement the loop counter */
  125. blkCnt--;
  126. }
  127. /* If the blockSize is not a multiple of 4,
  128. * compute the remaining samples */
  129. blkCnt = blockSize % 0x4u;
  130. while(blkCnt > 0u)
  131. {
  132. /* Perform multiplication and store in the scratch buffer */
  133. *pScratchOut++ = ((q31_t) * px++ * coeff);
  134. /* Decrement the loop counter */
  135. blkCnt--;
  136. }
  137. /* Load the coefficient value and
  138. * increment the coefficient buffer for the next set of state values */
  139. coeff = *pCoeffs++;
  140. /* Read Index, from where the state buffer should be read, is calculated. */
  141. readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
  142. /* Wraparound of readIndex */
  143. if(readIndex < 0)
  144. {
  145. readIndex += (int32_t) delaySize;
  146. }
  147. /* Loop over the number of taps. */
  148. tapCnt = (uint32_t) numTaps - 1u;
  149. while(tapCnt > 0u)
  150. {
  151. /* Working pointer for state buffer is updated */
  152. py = pState;
  153. /* blockSize samples are read from the state buffer */
  154. arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
  155. (int32_t) blockSize, 1, blockSize);
  156. /* Working pointer for the scratch buffer of state values */
  157. px = pb;
  158. /* Working pointer for scratch buffer of output values */
  159. pScratchOut = pScr2;
  160. /* Loop over the blockSize. Unroll by a factor of 4.
  161. * Compute 4 MACS at a time. */
  162. blkCnt = blockSize >> 2;
  163. while(blkCnt > 0u)
  164. {
  165. /* Perform Multiply-Accumulate */
  166. in = *pScratchOut + ((q31_t) * px++ * coeff);
  167. *pScratchOut++ = in;
  168. in = *pScratchOut + ((q31_t) * px++ * coeff);
  169. *pScratchOut++ = in;
  170. in = *pScratchOut + ((q31_t) * px++ * coeff);
  171. *pScratchOut++ = in;
  172. in = *pScratchOut + ((q31_t) * px++ * coeff);
  173. *pScratchOut++ = in;
  174. /* Decrement the loop counter */
  175. blkCnt--;
  176. }
  177. /* If the blockSize is not a multiple of 4,
  178. * compute the remaining samples */
  179. blkCnt = blockSize % 0x4u;
  180. while(blkCnt > 0u)
  181. {
  182. /* Perform Multiply-Accumulate */
  183. in = *pScratchOut + ((q31_t) * px++ * coeff);
  184. *pScratchOut++ = in;
  185. /* Decrement the loop counter */
  186. blkCnt--;
  187. }
  188. /* Load the coefficient value and
  189. * increment the coefficient buffer for the next set of state values */
  190. coeff = *pCoeffs++;
  191. /* Read Index, from where the state buffer should be read, is calculated. */
  192. readIndex = ((int32_t) S->stateIndex -
  193. (int32_t) blockSize) - *pTapDelay++;
  194. /* Wraparound of readIndex */
  195. if(readIndex < 0)
  196. {
  197. readIndex += (int32_t) delaySize;
  198. }
  199. /* Decrement the tap loop counter */
  200. tapCnt--;
  201. }
  202. /* All the output values are in pScratchOut buffer.
  203. Convert them into 1.15 format, saturate and store in the destination buffer. */
  204. /* Loop over the blockSize. */
  205. blkCnt = blockSize >> 2;
  206. while(blkCnt > 0u)
  207. {
  208. in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
  209. in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
  210. in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
  211. in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
  212. *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
  213. /* Decrement the blockSize loop counter */
  214. blkCnt--;
  215. }
  216. /* If the blockSize is not a multiple of 4,
  217. remaining samples are processed in the below loop */
  218. blkCnt = blockSize % 0x4u;
  219. while(blkCnt > 0u)
  220. {
  221. *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
  222. /* Decrement the blockSize loop counter */
  223. blkCnt--;
  224. }
  225. #else
  226. /* Run the below code for Cortex-M0 */
  227. /* BlockSize of Input samples are copied into the state buffer */
  228. /* StateIndex points to the starting position to write in the state buffer */
  229. arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
  230. blockSize);
  231. /* Loop over the number of taps. */
  232. tapCnt = numTaps;
  233. /* Read Index, from where the state buffer should be read, is calculated. */
  234. readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
  235. /* Wraparound of readIndex */
  236. if(readIndex < 0)
  237. {
  238. readIndex += (int32_t) delaySize;
  239. }
  240. /* Working pointer for state buffer is updated */
  241. py = pState;
  242. /* blockSize samples are read from the state buffer */
  243. arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
  244. (int32_t) blockSize, 1, blockSize);
  245. /* Working pointer for the scratch buffer of state values */
  246. px = pb;
  247. /* Working pointer for scratch buffer of output values */
  248. pScratchOut = pScr2;
  249. /* Loop over the blockSize */
  250. blkCnt = blockSize;
  251. while(blkCnt > 0u)
  252. {
  253. /* Perform multiplication and store in the scratch buffer */
  254. *pScratchOut++ = ((q31_t) * px++ * coeff);
  255. /* Decrement the loop counter */
  256. blkCnt--;
  257. }
  258. /* Load the coefficient value and
  259. * increment the coefficient buffer for the next set of state values */
  260. coeff = *pCoeffs++;
  261. /* Read Index, from where the state buffer should be read, is calculated. */
  262. readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
  263. /* Wraparound of readIndex */
  264. if(readIndex < 0)
  265. {
  266. readIndex += (int32_t) delaySize;
  267. }
  268. /* Loop over the number of taps. */
  269. tapCnt = (uint32_t) numTaps - 1u;
  270. while(tapCnt > 0u)
  271. {
  272. /* Working pointer for state buffer is updated */
  273. py = pState;
  274. /* blockSize samples are read from the state buffer */
  275. arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
  276. (int32_t) blockSize, 1, blockSize);
  277. /* Working pointer for the scratch buffer of state values */
  278. px = pb;
  279. /* Working pointer for scratch buffer of output values */
  280. pScratchOut = pScr2;
  281. /* Loop over the blockSize */
  282. blkCnt = blockSize;
  283. while(blkCnt > 0u)
  284. {
  285. /* Perform Multiply-Accumulate */
  286. in = *pScratchOut + ((q31_t) * px++ * coeff);
  287. *pScratchOut++ = in;
  288. /* Decrement the loop counter */
  289. blkCnt--;
  290. }
  291. /* Load the coefficient value and
  292. * increment the coefficient buffer for the next set of state values */
  293. coeff = *pCoeffs++;
  294. /* Read Index, from where the state buffer should be read, is calculated. */
  295. readIndex =
  296. ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
  297. /* Wraparound of readIndex */
  298. if(readIndex < 0)
  299. {
  300. readIndex += (int32_t) delaySize;
  301. }
  302. /* Decrement the tap loop counter */
  303. tapCnt--;
  304. }
  305. /* All the output values are in pScratchOut buffer.
  306. Convert them into 1.15 format, saturate and store in the destination buffer. */
  307. /* Loop over the blockSize. */
  308. blkCnt = blockSize;
  309. while(blkCnt > 0u)
  310. {
  311. *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
  312. /* Decrement the blockSize loop counter */
  313. blkCnt--;
  314. }
  315. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  316. }
  317. /**
  318. * @} end of FIR_Sparse group
  319. */