dropLin hai 3 semanas
achega
6c8339cc35
Modificáronse 100 ficheiros con 36492 adicións e 0 borrados
  1. 11 0
      .gitignore
  2. 97 0
      README.md
  3. 384 0
      at32f413_board/at32f413_board.c
  4. 148 0
      at32f413_board/at32f413_board.h
  5. 7 0
      gitFocePushRemote.bat
  6. 44 0
      inc/at32f413_clock.h
  7. 166 0
      inc/at32f413_conf.h
  8. 56 0
      inc/at32f413_int.h
  9. 86 0
      inc/board.h
  10. 30 0
      keilkilll.bat
  11. 517 0
      libraries/cmsis/cm4/core_support/arm_common_tables.h
  12. 76 0
      libraries/cmsis/cm4/core_support/arm_const_structs.h
  13. 348 0
      libraries/cmsis/cm4/core_support/arm_helium_utils.h
  14. 8970 0
      libraries/cmsis/cm4/core_support/arm_math.h
  15. 235 0
      libraries/cmsis/cm4/core_support/arm_mve_tables.h
  16. 372 0
      libraries/cmsis/cm4/core_support/arm_vec_math.h
  17. 885 0
      libraries/cmsis/cm4/core_support/cmsis_armcc.h
  18. 1467 0
      libraries/cmsis/cm4/core_support/cmsis_armclang.h
  19. 1893 0
      libraries/cmsis/cm4/core_support/cmsis_armclang_ltm.h
  20. 283 0
      libraries/cmsis/cm4/core_support/cmsis_compiler.h
  21. 2177 0
      libraries/cmsis/cm4/core_support/cmsis_gcc.h
  22. 968 0
      libraries/cmsis/cm4/core_support/cmsis_iccarm.h
  23. 39 0
      libraries/cmsis/cm4/core_support/cmsis_version.h
  24. 2129 0
      libraries/cmsis/cm4/core_support/core_cm4.h
  25. 275 0
      libraries/cmsis/cm4/core_support/mpu_armv7.h
  26. 352 0
      libraries/cmsis/cm4/core_support/mpu_armv8.h
  27. 337 0
      libraries/cmsis/cm4/core_support/pmu_armv8.h
  28. 419 0
      libraries/cmsis/cm4/device_support/at32f413.h
  29. 150 0
      libraries/cmsis/cm4/device_support/at32f413_conf_template.h
  30. 168 0
      libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F413x8_FLASH.ld
  31. 168 0
      libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F413xB_FLASH.ld
  32. 168 0
      libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F413xC_FLASH.ld
  33. 431 0
      libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f413.s
  34. 30 0
      libraries/cmsis/cm4/device_support/startup/iar/linker/AT32F413x8.icf
  35. 30 0
      libraries/cmsis/cm4/device_support/startup/iar/linker/AT32F413xB.icf
  36. 30 0
      libraries/cmsis/cm4/device_support/startup/iar/linker/AT32F413xC.icf
  37. 496 0
      libraries/cmsis/cm4/device_support/startup/iar/startup_at32f413.s
  38. 357 0
      libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f413.s
  39. 189 0
      libraries/cmsis/cm4/device_support/system_at32f413.c
  40. 89 0
      libraries/cmsis/cm4/device_support/system_at32f413.h
  41. 414 0
      libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
  42. 21 0
      libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
  43. 19 0
      libraries/cmsis/dsp/ComputeLibrary/README.md
  44. 55 0
      libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
  45. 200 0
      libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
  46. 58 0
      libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
  47. 1661 0
      libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
  48. 75 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
  49. 19 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
  50. 196 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
  51. 178 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
  52. 208 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
  53. 180 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
  54. 199 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
  55. 176 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
  56. 159 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
  57. 158 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
  58. 137 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
  59. 129 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
  60. 130 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
  61. 226 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
  62. 172 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
  63. 174 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
  64. 191 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
  65. 200 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
  66. 192 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
  67. 168 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
  68. 168 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
  69. 192 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
  70. 171 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
  71. 178 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
  72. 171 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
  73. 130 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
  74. 122 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
  75. 122 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
  76. 196 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
  77. 168 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
  78. 175 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
  79. 162 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
  80. 137 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
  81. 128 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
  82. 128 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
  83. 216 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
  84. 201 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
  85. 244 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
  86. 186 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
  87. 251 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
  88. 232 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
  89. 225 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
  90. 202 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
  91. 178 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
  92. 159 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
  93. 158 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
  94. 137 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
  95. 129 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
  96. 129 0
      libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
  97. 29 0
      libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
  98. 19 0
      libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
  99. 397 0
      libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
  100. 280 0
      libraries/cmsis/dsp/Source/CMakeLists.txt

+ 11 - 0
.gitignore

@@ -0,0 +1,11 @@
+.vscode
+*.html
+gitPushRemote.bat
+gitInit.bat
+# 忽略my_folder文件夹下的所有文件和文件夹  
+keil_v5/*  
+  
+# 但不忽略keil_v5文件夹下的.uvprojx和.uvproj文件  
+!keil_v5/Listings/*.bin 
+!keil_v5/*.uvprojx  
+!keil_v5/*.uvoptx

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 97 - 0
README.md


+ 384 - 0
at32f413_board/at32f413_board.c

@@ -0,0 +1,384 @@
+/**
+  **************************************************************************
+  * @file     at32f413_board.c
+  * @brief    set of firmware functions to manage leds and push-button.
+  *           initialize delay function.
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+#include "at32f413_board.h"
+
+/** @addtogroup AT32F413_board
+  * @{
+  */
+
+/** @defgroup BOARD
+  * @brief onboard periph driver
+  * @{
+  */
+
+/* delay macros */
+#define STEP_DELAY_MS                    50
+
+/* at-start led resouce array */
+gpio_type *led_gpio_port[LED_NUM]        = {LED2_GPIO, LED3_GPIO, LED4_GPIO};
+uint16_t led_gpio_pin[LED_NUM]           = {LED2_PIN, LED3_PIN, LED4_PIN};
+crm_periph_clock_type led_gpio_crm_clk[LED_NUM] = {LED2_GPIO_CRM_CLK, LED3_GPIO_CRM_CLK, LED4_GPIO_CRM_CLK};
+
+/* delay variable */
+static __IO uint32_t fac_us;
+static __IO uint32_t fac_ms;
+
+/* support printf function, usemicrolib is unnecessary */
+#if (__ARMCC_VERSION > 6000000)
+  __asm (".global __use_no_semihosting\n\t");
+  void _sys_exit(int x)
+  {
+    x = x;
+  }
+  /* __use_no_semihosting was requested, but _ttywrch was */
+  void _ttywrch(int ch)
+  {
+    ch = ch;
+  }
+  FILE __stdout;
+#else
+ #ifdef __CC_ARM
+  #pragma import(__use_no_semihosting)
+  struct __FILE
+  {
+    int handle;
+  };
+  FILE __stdout;
+  void _sys_exit(int x)
+  {
+    x = x;
+  }
+  /* __use_no_semihosting was requested, but _ttywrch was */
+  void _ttywrch(int ch)
+  {
+    ch = ch;
+  }
+ #endif
+#endif
+
+#if defined (__GNUC__) && !defined (__clang__)
+  #define PUTCHAR_PROTOTYPE int __io_putchar(int ch)
+#else
+  #define PUTCHAR_PROTOTYPE int fputc(int ch, FILE *f)
+#endif
+  
+/**
+  * @brief  retargets the c library printf function to the usart.
+  * @param  none
+  * @retval none
+  */
+PUTCHAR_PROTOTYPE
+{
+  while(usart_flag_get(PRINT_UART, USART_TDBE_FLAG) == RESET);
+  usart_data_transmit(PRINT_UART, (uint16_t)ch);
+  while(usart_flag_get(PRINT_UART, USART_TDC_FLAG) == RESET);
+  return ch;
+}
+
+#if (defined (__GNUC__) && !defined (__clang__)) || (defined (__ICCARM__))
+#if defined (__GNUC__) && !defined (__clang__)
+int _write(int fd, char *pbuffer, int size)
+#elif defined ( __ICCARM__ )
+#pragma module_name = "?__write"
+int __write(int fd, char *pbuffer, int size)
+#endif
+{
+  for(int i = 0; i < size; i ++)
+  {
+    while(usart_flag_get(PRINT_UART, USART_TDBE_FLAG) == RESET);
+    usart_data_transmit(PRINT_UART, (uint16_t)(*pbuffer++));
+    while(usart_flag_get(PRINT_UART, USART_TDC_FLAG) == RESET);
+  }
+
+  return size;
+}
+#endif
+
+/**
+  * @brief  initialize uart
+  * @param  baudrate: uart baudrate
+  * @retval none
+  */
+void uart_print_init(uint32_t baudrate)
+{
+  gpio_init_type gpio_init_struct;
+
+#if defined (__GNUC__) && !defined (__clang__)
+  setvbuf(stdout, NULL, _IONBF, 0);
+#endif
+
+  /* enable the uart and gpio clock */
+  crm_periph_clock_enable(PRINT_UART_CRM_CLK, TRUE);
+  crm_periph_clock_enable(PRINT_UART_TX_GPIO_CRM_CLK, TRUE);
+
+  gpio_default_para_init(&gpio_init_struct);
+
+  /* configure the uart tx pin */
+  gpio_init_struct.gpio_drive_strength = GPIO_DRIVE_STRENGTH_STRONGER;
+  gpio_init_struct.gpio_out_type  = GPIO_OUTPUT_PUSH_PULL;
+  gpio_init_struct.gpio_mode = GPIO_MODE_MUX;
+  gpio_init_struct.gpio_pins = PRINT_UART_TX_PIN;
+  gpio_init_struct.gpio_pull = GPIO_PULL_NONE;
+  gpio_init(PRINT_UART_TX_GPIO, &gpio_init_struct);
+
+  /* configure uart param */
+  usart_init(PRINT_UART, baudrate, USART_DATA_8BITS, USART_STOP_1_BIT);
+  usart_transmitter_enable(PRINT_UART, TRUE);
+  usart_enable(PRINT_UART, TRUE);
+}
+
+/**
+  * @brief  board initialize interface init led and button
+  * @param  none
+  * @retval none
+  */
+void at32_board_init()
+{
+  /* initialize delay function */
+  delay_init();
+
+}
+
+/**
+  * @brief  configure button gpio
+  * @param  button: specifies the button to be configured.
+  * @retval none
+  */
+void at32_button_init(void)
+{
+  gpio_init_type gpio_init_struct;
+
+  /* enable the button clock */
+  crm_periph_clock_enable(USER_BUTTON_CRM_CLK, TRUE);
+
+  /* set default parameter */
+  gpio_default_para_init(&gpio_init_struct);
+
+  /* configure button pin as input with pull-up/pull-down */
+  gpio_init_struct.gpio_drive_strength = GPIO_DRIVE_STRENGTH_STRONGER;
+  gpio_init_struct.gpio_out_type  = GPIO_OUTPUT_PUSH_PULL;
+  gpio_init_struct.gpio_mode = GPIO_MODE_INPUT;
+  gpio_init_struct.gpio_pins = USER_BUTTON_PIN;
+  gpio_init_struct.gpio_pull = GPIO_PULL_DOWN;
+  gpio_init(USER_BUTTON_PORT, &gpio_init_struct);
+}
+
+/**
+  * @brief  returns the selected button state
+  * @param  none
+  * @retval the button gpio pin value
+  */
+uint8_t at32_button_state(void)
+{
+  return gpio_input_data_bit_read(USER_BUTTON_PORT, USER_BUTTON_PIN);
+}
+
+/**
+  * @brief  returns which button have press down
+  * @param  none
+  * @retval the button have press down
+  */
+button_type at32_button_press()
+{
+  static uint8_t pressed = 1;
+  /* get button state in at_start board */
+  if((pressed == 1) && (at32_button_state() != RESET))
+  {
+    /* debounce */
+    pressed = 0;
+    delay_ms(10);
+    if(at32_button_state() != RESET)
+      return USER_BUTTON;
+  }
+  else if(at32_button_state() == RESET)
+  {
+    pressed = 1;
+  }
+  return NO_BUTTON;
+}
+
+/**
+  * @brief  configure led gpio
+  * @param  led: specifies the led to be configured.
+  * @retval none
+  */
+void at32_led_init(led_type led)
+{
+  gpio_init_type gpio_init_struct;
+
+  /* enable the led clock */
+  crm_periph_clock_enable(led_gpio_crm_clk[led], TRUE);
+
+  /* set default parameter */
+  gpio_default_para_init(&gpio_init_struct);
+
+  /* configure the led gpio */
+  gpio_init_struct.gpio_drive_strength = GPIO_DRIVE_STRENGTH_STRONGER;
+  gpio_init_struct.gpio_out_type  = GPIO_OUTPUT_PUSH_PULL;
+  gpio_init_struct.gpio_mode = GPIO_MODE_OUTPUT;
+  gpio_init_struct.gpio_pins = led_gpio_pin[led];
+  gpio_init_struct.gpio_pull = GPIO_PULL_NONE;
+  gpio_init(led_gpio_port[led], &gpio_init_struct);
+}
+
+/**
+  * @brief  turns selected led on.
+  * @param  led: specifies the led to be set on.
+  *   this parameter can be one of following parameters:
+  *     @arg LED2
+  *     @arg LED3
+  *     @arg LED4
+  * @retval none
+  */
+void at32_led_on(led_type led)
+{
+  if(led > (LED_NUM - 1))
+    return;
+  if(led_gpio_pin[led])
+    led_gpio_port[led]->clr = led_gpio_pin[led];
+}
+
+/**
+  * @brief  turns selected led off.
+  * @param  led: specifies the led to be set off.
+  *   this parameter can be one of following parameters:
+  *     @arg LED2
+  *     @arg LED3
+  *     @arg LED4
+  * @retval none
+  */
+void at32_led_off(led_type led)
+{
+  if(led > (LED_NUM - 1))
+    return;
+  if(led_gpio_pin[led])
+    led_gpio_port[led]->scr = led_gpio_pin[led];
+}
+
+/**
+  * @brief  turns selected led toggle.
+  * @param  led: specifies the led to be set off.
+  *   this parameter can be one of following parameters:
+  *     @arg LED2
+  *     @arg LED3
+  *     @arg LED4
+  * @retval none
+  */
+void at32_led_toggle(led_type led)
+{
+  if(led > (LED_NUM - 1))
+    return;
+  if(led_gpio_pin[led])
+    led_gpio_port[led]->odt ^= led_gpio_pin[led];
+}
+
+/**
+  * @brief  initialize delay function
+  * @param  none
+  * @retval none
+  */
+void delay_init()
+{
+  /* configure systick */
+  systick_clock_source_config(SYSTICK_CLOCK_SOURCE_AHBCLK_NODIV);
+  fac_us = system_core_clock / (1000000U);
+  fac_ms = fac_us * (1000U);
+}
+
+/**
+  * @brief  inserts a delay time.
+  * @param  nus: specifies the delay time length, in microsecond.
+  * @retval none
+  */
+void delay_us(uint32_t nus)
+{
+  uint32_t temp = 0;
+  SysTick->LOAD = (uint32_t)(nus * fac_us);
+  SysTick->VAL = 0x00;
+  SysTick->CTRL |= SysTick_CTRL_ENABLE_Msk ;
+  do
+  {
+    temp = SysTick->CTRL;
+  }while((temp & 0x01) && !(temp & (1 << 16)));
+
+  SysTick->CTRL &= ~SysTick_CTRL_ENABLE_Msk;
+  SysTick->VAL = 0x00;
+}
+
+/**
+  * @brief  inserts a delay time.
+  * @param  nms: specifies the delay time length, in milliseconds.
+  * @retval none
+  */
+void delay_ms(uint16_t nms)
+{
+  uint32_t temp = 0;
+  while(nms)
+  {
+    if(nms > STEP_DELAY_MS)
+    {
+      SysTick->LOAD = (uint32_t)(STEP_DELAY_MS * fac_ms);
+      nms -= STEP_DELAY_MS;
+    }
+    else
+    {
+      SysTick->LOAD = (uint32_t)(nms * fac_ms);
+      nms = 0;
+    }
+    SysTick->VAL = 0x00;
+    SysTick->CTRL |= SysTick_CTRL_ENABLE_Msk;
+    do
+    {
+      temp = SysTick->CTRL;
+    }while((temp & 0x01) && !(temp & (1 << 16)));
+
+    SysTick->CTRL &= ~SysTick_CTRL_ENABLE_Msk;
+    SysTick->VAL = 0x00;
+  }
+}
+
+/**
+  * @brief  inserts a delay time.
+  * @param  sec: specifies the delay time, in seconds.
+  * @retval none
+  */
+void delay_sec(uint16_t sec)
+{
+  uint16_t index;
+  for(index = 0; index < sec; index++)
+  {
+    delay_ms(500);
+    delay_ms(500);
+  }
+}
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+

+ 148 - 0
at32f413_board/at32f413_board.h

@@ -0,0 +1,148 @@
+/**
+  **************************************************************************
+  * @file     at32f413_board.h
+  * @brief    header file for at-start board. set of firmware functions to
+  *           manage leds and push-button. initialize delay function.
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+#ifndef __AT32F413_BOARD_H
+#define __AT32F413_BOARD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "stdio.h"
+#include "at32f413.h"
+
+/** @addtogroup AT32F413_board
+  * @{
+  */
+
+/** @addtogroup BOARD
+  * @{
+  */
+
+/** @defgroup BOARD_pins_definition
+  * @{
+  */
+
+/**
+  * this header include define support list:
+  * 1. at-start-f413 v1.x board
+  * if define AT_START_F413_V1, the header file support at-start-f413 v1.x board
+  */
+
+#if !defined (AT_START_F413_V1)
+#error "please select first the board at-start device used in your application (in at32f413_board.h file)"
+#endif
+
+/******************** define led ********************/
+typedef enum
+{
+  LED2                                   = 0,
+  LED3                                   = 1,
+  LED4                                   = 2
+} led_type;
+
+#define LED_NUM                          3
+
+#if defined (AT_START_F413_V1)
+#define LED2_PIN                         GPIO_PINS_2
+#define LED2_GPIO                        GPIOC
+#define LED2_GPIO_CRM_CLK                CRM_GPIOC_PERIPH_CLOCK
+
+#define LED3_PIN                         GPIO_PINS_3
+#define LED3_GPIO                        GPIOC
+#define LED3_GPIO_CRM_CLK                CRM_GPIOC_PERIPH_CLOCK
+
+#define LED4_PIN                         GPIO_PINS_5
+#define LED4_GPIO                        GPIOC
+#define LED4_GPIO_CRM_CLK                CRM_GPIOC_PERIPH_CLOCK
+#endif
+
+/**************** define print uart ******************/
+#define PRINT_UART                       USART1
+#define PRINT_UART_CRM_CLK               CRM_USART1_PERIPH_CLOCK
+#define PRINT_UART_TX_PIN                GPIO_PINS_9
+#define PRINT_UART_TX_GPIO               GPIOA
+#define PRINT_UART_TX_GPIO_CRM_CLK       CRM_GPIOA_PERIPH_CLOCK
+
+/******************* define button *******************/
+typedef enum
+{
+  USER_BUTTON                            = 0,
+  NO_BUTTON                              = 1
+} button_type;
+
+#define USER_BUTTON_PIN                  GPIO_PINS_0
+#define USER_BUTTON_PORT                 GPIOA
+#define USER_BUTTON_CRM_CLK              CRM_GPIOA_PERIPH_CLOCK
+
+/**
+  * @}
+  */
+
+/** @defgroup BOARD_exported_functions
+  * @{
+  */
+
+/******************** functions ********************/
+void at32_board_init(void);
+
+/* led operation function */
+void at32_led_init(led_type led);
+void at32_led_on(led_type led);
+void at32_led_off(led_type led);
+void at32_led_toggle(led_type led);
+
+/* button operation function */
+void at32_button_init(void);
+button_type at32_button_press(void);
+uint8_t at32_button_state(void);
+
+/* delay function */
+void delay_init(void);
+void delay_us(uint32_t nus);
+void delay_ms(uint16_t nms);
+void delay_sec(uint16_t sec);
+
+/* printf uart init function */
+void uart_print_init(uint32_t baudrate);
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+

+ 7 - 0
gitFocePushRemote.bat

@@ -0,0 +1,7 @@
+@echo off  
+setlocal  
+  
+:: 假设你已经处于正确的Git仓库目录中  
+  
+:: 添加所有更改到暂存区  
+git push -f --set-upstream origin master:master

+ 44 - 0
inc/at32f413_clock.h

@@ -0,0 +1,44 @@
+/**
+  **************************************************************************
+  * @file     at32f413_clock.h
+  * @brief    header file of clock program
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+/* define to prevent recursive inclusion -------------------------------------*/
+#ifndef __AT32F413_CLOCK_H
+#define __AT32F413_CLOCK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* includes ------------------------------------------------------------------*/
+#include "at32f413.h"
+
+/* exported functions ------------------------------------------------------- */
+void system_clock_config(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __AT32F413_CLOCK_H */
+

+ 166 - 0
inc/at32f413_conf.h

@@ -0,0 +1,166 @@
+/**
+  **************************************************************************
+  * @file     at32f413_conf.h
+  * @brief    at32f413 config header file
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+/* define to prevent recursive inclusion -------------------------------------*/
+#ifndef __AT32F413_CONF_H
+#define __AT32F413_CONF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @addtogroup AT32F413_periph_template
+  * @{
+  */
+
+/** @addtogroup 413_Library_configuration Library_configuration
+  * @{
+  */
+
+/**
+  * @brief in the following line adjust the value of high speed external crystal (hext)
+  * used in your application
+  *
+  * tip: to avoid modifying this file each time you need to use different hext, you
+  *      can define the hext value in your toolchain compiler preprocessor.
+  *
+  */
+#if !defined  HEXT_VALUE
+#define HEXT_VALUE               ((uint32_t)8000000) /*!< value of the high speed external crystal in hz */
+#endif
+
+/**
+  * @brief in the following line adjust the high speed external crystal (hext) startup
+  * timeout value
+  */
+#define HEXT_STARTUP_TIMEOUT             ((uint16_t)0x3000)  /*!< time out for hext start up */
+#define HICK_VALUE                       ((uint32_t)8000000) /*!< value of the high speed internal clock in hz */
+#define LEXT_VALUE                       ((uint32_t)32768)   /*!< value of the low speed external clock in hz */
+
+/* module define -------------------------------------------------------------*/
+#define CRM_MODULE_ENABLED
+#define TMR_MODULE_ENABLED
+#define RTC_MODULE_ENABLED
+#define BPR_MODULE_ENABLED
+#define GPIO_MODULE_ENABLED
+#define I2C_MODULE_ENABLED
+#define USART_MODULE_ENABLED
+#define PWC_MODULE_ENABLED
+#define CAN_MODULE_ENABLED
+#define ADC_MODULE_ENABLED
+#define SPI_MODULE_ENABLED
+#define DMA_MODULE_ENABLED
+#define DEBUG_MODULE_ENABLED
+#define FLASH_MODULE_ENABLED
+#define CRC_MODULE_ENABLED
+#define WWDT_MODULE_ENABLED
+#define WDT_MODULE_ENABLED
+#define EXINT_MODULE_ENABLED
+#define SDIO_MODULE_ENABLED
+#define USB_MODULE_ENABLED
+#define ACC_MODULE_ENABLED
+#define MISC_MODULE_ENABLED
+
+/* includes ------------------------------------------------------------------*/
+#ifdef CRM_MODULE_ENABLED
+#include "at32f413_crm.h"
+#endif
+#ifdef TMR_MODULE_ENABLED
+#include "at32f413_tmr.h"
+#endif
+#ifdef RTC_MODULE_ENABLED
+#include "at32f413_rtc.h"
+#endif
+#ifdef BPR_MODULE_ENABLED
+#include "at32f413_bpr.h"
+#endif
+#ifdef GPIO_MODULE_ENABLED
+#include "at32f413_gpio.h"
+#endif
+#ifdef I2C_MODULE_ENABLED
+#include "at32f413_i2c.h"
+#endif
+#ifdef USART_MODULE_ENABLED
+#include "at32f413_usart.h"
+#endif
+#ifdef PWC_MODULE_ENABLED
+#include "at32f413_pwc.h"
+#endif
+#ifdef CAN_MODULE_ENABLED
+#include "at32f413_can.h"
+#endif
+#ifdef ADC_MODULE_ENABLED
+#include "at32f413_adc.h"
+#endif
+#ifdef SPI_MODULE_ENABLED
+#include "at32f413_spi.h"
+#endif
+#ifdef DMA_MODULE_ENABLED
+#include "at32f413_dma.h"
+#endif
+#ifdef DEBUG_MODULE_ENABLED
+#include "at32f413_debug.h"
+#endif
+#ifdef FLASH_MODULE_ENABLED
+#include "at32f413_flash.h"
+#endif
+#ifdef CRC_MODULE_ENABLED
+#include "at32f413_crc.h"
+#endif
+#ifdef WWDT_MODULE_ENABLED
+#include "at32f413_wwdt.h"
+#endif
+#ifdef WDT_MODULE_ENABLED
+#include "at32f413_wdt.h"
+#endif
+#ifdef EXINT_MODULE_ENABLED
+#include "at32f413_exint.h"
+#endif
+#ifdef SDIO_MODULE_ENABLED
+#include "at32f413_sdio.h"
+#endif
+#ifdef ACC_MODULE_ENABLED
+#include "at32f413_acc.h"
+#endif
+#ifdef MISC_MODULE_ENABLED
+#include "at32f413_misc.h"
+#endif
+#ifdef USB_MODULE_ENABLED
+#include "at32f413_usb.h"
+#endif
+
+/**
+  * @}
+  */
+
+  /**
+  * @}
+  */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 56 - 0
inc/at32f413_int.h

@@ -0,0 +1,56 @@
+/**
+  **************************************************************************
+  * @file     at32f413_int.h
+  * @brief    header file of main interrupt service routines.
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+/* define to prevent recursive inclusion -------------------------------------*/
+#ifndef __AT32F413_INT_H
+#define __AT32F413_INT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* includes ------------------------------------------------------------------*/
+#include "at32f413.h"
+
+/* exported types ------------------------------------------------------------*/
+/* exported constants --------------------------------------------------------*/
+/* exported macro ------------------------------------------------------------*/
+/* exported functions ------------------------------------------------------- */
+
+void NMI_Handler(void);
+void HardFault_Handler(void);
+void MemManage_Handler(void);
+void BusFault_Handler(void);
+void UsageFault_Handler(void);
+void SVC_Handler(void);
+void DebugMon_Handler(void);
+void PendSV_Handler(void);
+void SysTick_Handler(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+

+ 86 - 0
inc/board.h

@@ -0,0 +1,86 @@
+#ifndef __BOARD_H
+#define __BOARD_H
+#include "at32f413_board.h"
+#include "at32f413_clock.h"
+
+//--------------------------------------------
+
+#define BOARD_PIN_H TRUE
+#define BOARD_PIN_L FALSE
+
+//-----------------------LED指示灯,低电平驱动
+#define BOARD_PIN_LED1    GPIO_PINS_3   //
+#define BOARD_PORT_LED1   GPIOA         
+#define BOARD_GPIO_LED1   BOARD_PORT_LED1, BOARD_PIN_LED1          //
+
+#define BOARD_PIN_LED2    GPIO_PINS_3
+#define BOARD_PORT_LED2   GPIOC
+#define BOARD_GPIO_LED2   BOARD_PORT_LED2, BOARD_PIN_LED2          //
+
+#define LED_ON   BOARD_PIN_H
+#define LED_OFF   BOARD_PIN_L
+
+#define BOARD_PIN_BEEP    GPIO_PINS_12
+#define BOARD_PORT_BEEP    GPIOA
+#define BOARD_GPIO_BEEP   BOARD_PORT_BEEP, BOARD_PIN_BEEP          //
+
+#define BEEP_ON   BOARD_PIN_L
+#define BEEP_OFF   BOARD_PIN_H
+//-----------------------------------------LED指示灯,END
+//-----------------------LCD显示器驱动IO口
+#define BG_LED_ON       BOARD_PIN_H
+#define BG_LED_OFF      BOARD_PIN_L
+#define BOARD_PIN_LCD_BG_LED    GPIO_PINS_4   //
+#define BOARD_PORT_LCD_BG_LED   GPIOB
+#define BOARD_GPIO_LCD_BG_LED   BOARD_PORT_LCD_BG_LED, BOARD_PIN_LCD_BG_LED          //
+
+#define BOARD_PIN_LCD_SDA    GPIO_PINS_7   //
+#define BOARD_PORT_LCD_SDA   GPIOB
+#define BOARD_GPIO_LCD_SDA   BOARD_PORT_LCD_SDA, BOARD_PIN_LCD_SDA          //
+
+#define BOARD_PIN_LCD_CLK    GPIO_PINS_6   //
+#define BOARD_PORT_LCD_CLK   GPIOB
+#define BOARD_GPIO_LCD_CLK   BOARD_PORT_LCD_CLK, BOARD_PIN_LCD_CLK          //
+
+#define BOARD_PIN_LCD_RST    GPIO_PINS_5   //
+#define BOARD_PORT_LCD_RST   GPIOB
+#define BOARD_GPIO_LCD_RST   BOARD_PORT_LCD_RST, BOARD_PIN_LCD_RST          //
+
+#define LCD_SDA_H()     gpio_bits_write(BOARD_GPIO_LCD_SDA, BOARD_PIN_L)
+#define LCD_SDA_L()     gpio_bits_write(BOARD_GPIO_LCD_SDA, BOARD_PIN_H)
+#define READ_LCD_SDA()     gpio_output_data_bit_read(BOARD_GPIO_LCD_SDA)
+#define LCD_CLK_H()     gpio_bits_write(BOARD_GPIO_LCD_CLK, BOARD_PIN_L)
+#define LCD_CLK_L()     gpio_bits_write(BOARD_GPIO_LCD_CLK, BOARD_PIN_H)
+
+#define LCD_RST_ENABLE()     gpio_bits_write(BOARD_GPIO_LCD_RST, BOARD_PIN_H)
+#define LCD_RST_DISABLE()     gpio_bits_write(BOARD_GPIO_LCD_RST, BOARD_PIN_L)
+
+#define LCD_BG_LED_ON()     gpio_bits_write(BOARD_GPIO_LCD_BG_LED, BG_LED_OFF)
+#define LCD_BG_LED_OFF()     gpio_bits_write(BOARD_GPIO_LCD_BG_LED, BG_LED_ON)
+
+//-----------------------------------------LCD显示器驱动IO口,END
+
+#define BOARD_PIN_KEY1    GPIO_PINS_0
+#define BOARD_PORT_KEY1   GPIOA
+#define BOARD_GPIO_KEY1   BOARD_PORT_KEY1, BOARD_PIN_KEY1          //
+
+#define BOARD_PIN_KEY2    GPIO_PINS_14
+#define BOARD_PORT_KEY2   GPIOB
+#define BOARD_GPIO_KEY2   BOARD_PORT_KEY2, BOARD_PIN_KEY2          //
+
+#define BOARD_PIN_KEY3    GPIO_PINS_15
+#define BOARD_PORT_KEY3   GPIOB
+#define BOARD_GPIO_KEY3   BOARD_PORT_KEY3, BOARD_PIN_KEY3          //
+
+#define BOARD_PIN_KEY4    GPIO_PINS_6
+#define BOARD_PORT_KEY4   GPIOC
+#define BOARD_GPIO_KEY4   BOARD_PORT_KEY4, BOARD_PIN_KEY4          //
+
+#define BOARD_PIN_KEY5    GPIO_PINS_7
+#define BOARD_PORT_KEY5   GPIOC
+#define BOARD_GPIO_KEY5   BOARD_PORT_KEY5, BOARD_PIN_KEY5          //
+
+#define BOARD_PIN_CURRENT_AD    GPIO_PINS_1  //对应ADC1
+#define BOARD_PORT_CURRENT_AD   GPIOA       
+#define BOARD_GPIO_CURRENT_AD   BOARD_PORT_CURRENT_AD, BOARD_PIN_CURRENT_AD          //
+#endif

+ 30 - 0
keilkilll.bat

@@ -0,0 +1,30 @@
+del *.bak /s
+del *.ddk /s
+del *.edk /s
+del *.lst /s
+del *.lnp /s
+del *.mpf /s
+del *.mpj /s
+del *.obj /s
+del *.hex /s
+del *.bin /s
+del *.omf /s
+::del *.opt /s  ::²»ÔÊÐíɾ³ýJLINKµÄÉèÖÃ
+del *.plg /s
+del *.rpt /s
+del *.tmp /s
+del *.__i /s
+del *.crf /s
+del *.o /s
+del *.d /s
+del *.axf /s
+del *.tra /s
+del *.dep /s           
+del JLinkLog.txt /s
+
+del *.iex /s
+del *.htm /s
+del *.sct /s
+del *.map /s
+del *.hex /s
+exit

+ 517 - 0
libraries/cmsis/cm4/core_support/arm_common_tables.h

@@ -0,0 +1,517 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_common_tables.h
+ * Description:  Extern declaration for common tables
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_COMMON_TABLES_H
+#define _ARM_COMMON_TABLES_H
+
+#include "arm_math.h"
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+  /* Double Precision Float CFFT twiddles */
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+    extern const uint16_t armBitRevTable[1024];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_16)
+    extern const uint64_t twiddleCoefF64_16[32];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_32)
+    extern const uint64_t twiddleCoefF64_32[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_64)
+    extern const uint64_t twiddleCoefF64_64[128];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_128)
+    extern const uint64_t twiddleCoefF64_128[256];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_256)
+    extern const uint64_t twiddleCoefF64_256[512];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_512)
+    extern const uint64_t twiddleCoefF64_512[1024];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_1024)
+    extern const uint64_t twiddleCoefF64_1024[2048];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_2048)
+    extern const uint64_t twiddleCoefF64_2048[4096];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_4096)
+    extern const uint64_t twiddleCoefF64_4096[8192];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16)
+    extern const float32_t twiddleCoef_16[32];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
+    extern const float32_t twiddleCoef_32[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64)
+    extern const float32_t twiddleCoef_64[128];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
+    extern const float32_t twiddleCoef_128[256];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256)
+    extern const float32_t twiddleCoef_256[512];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)
+    extern const float32_t twiddleCoef_512[1024];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024)
+    extern const float32_t twiddleCoef_1024[2048];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)
+    extern const float32_t twiddleCoef_2048[4096];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+    extern const float32_t twiddleCoef_4096[8192];
+    #define twiddleCoef twiddleCoef_4096
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16)
+    extern const q31_t twiddleCoef_16_q31[24];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32)
+    extern const q31_t twiddleCoef_32_q31[48];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64)
+    extern const q31_t twiddleCoef_64_q31[96];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128)
+    extern const q31_t twiddleCoef_128_q31[192];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256)
+    extern const q31_t twiddleCoef_256_q31[384];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512)
+    extern const q31_t twiddleCoef_512_q31[768];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024)
+    extern const q31_t twiddleCoef_1024_q31[1536];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)
+    extern const q31_t twiddleCoef_2048_q31[3072];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
+    extern const q31_t twiddleCoef_4096_q31[6144];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16)
+    extern const q15_t twiddleCoef_16_q15[24];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32)
+    extern const q15_t twiddleCoef_32_q15[48];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_64)
+    extern const q15_t twiddleCoef_64_q15[96];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128)
+    extern const q15_t twiddleCoef_128_q15[192];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256)
+    extern const q15_t twiddleCoef_256_q15[384];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512)
+    extern const q15_t twiddleCoef_512_q15[768];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024)
+    extern const q15_t twiddleCoef_1024_q15[1536];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)
+    extern const q15_t twiddleCoef_2048_q15[3072];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
+    extern const q15_t twiddleCoef_4096_q15[6144];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  /* Double Precision Float RFFT twiddles */
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32)
+    extern const uint64_t twiddleCoefF64_rfft_32[32];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64)
+    extern const uint64_t twiddleCoefF64_rfft_64[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128)
+    extern const uint64_t twiddleCoefF64_rfft_128[128];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256)
+    extern const uint64_t twiddleCoefF64_rfft_256[256];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512)
+    extern const uint64_t twiddleCoefF64_rfft_512[512];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024)
+    extern const uint64_t twiddleCoefF64_rfft_1024[1024];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048)
+    extern const uint64_t twiddleCoefF64_rfft_2048[2048];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096)
+    extern const uint64_t twiddleCoefF64_rfft_4096[4096];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32)
+    extern const float32_t twiddleCoef_rfft_32[32];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64)
+    extern const float32_t twiddleCoef_rfft_64[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128)
+    extern const float32_t twiddleCoef_rfft_128[128];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256)
+    extern const float32_t twiddleCoef_rfft_256[256];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512)
+    extern const float32_t twiddleCoef_rfft_512[512];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024)
+    extern const float32_t twiddleCoef_rfft_1024[1024];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048)
+    extern const float32_t twiddleCoef_rfft_2048[2048];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096)
+    extern const float32_t twiddleCoef_rfft_4096[4096];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+
+  /* Double precision floating-point bit reversal tables */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_16)
+    #define ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH ((uint16_t)12)
+    extern const uint16_t armBitRevIndexTableF64_16[ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_32)
+    #define ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH ((uint16_t)24)
+    extern const uint16_t armBitRevIndexTableF64_32[ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_64)
+    #define ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH ((uint16_t)56)
+    extern const uint16_t armBitRevIndexTableF64_64[ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_128)
+    #define ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH ((uint16_t)112)
+    extern const uint16_t armBitRevIndexTableF64_128[ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_256)
+    #define ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH ((uint16_t)240)
+    extern const uint16_t armBitRevIndexTableF64_256[ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_512)
+    #define ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH ((uint16_t)480)
+    extern const uint16_t armBitRevIndexTableF64_512[ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_1024)
+    #define ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH ((uint16_t)992)
+    extern const uint16_t armBitRevIndexTableF64_1024[ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_2048)
+    #define ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH ((uint16_t)1984)
+    extern const uint16_t armBitRevIndexTableF64_2048[ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_4096)
+    #define ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH ((uint16_t)4032)
+    extern const uint16_t armBitRevIndexTableF64_4096[ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+  /* floating-point bit reversal tables */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_16)
+    #define ARMBITREVINDEXTABLE_16_TABLE_LENGTH ((uint16_t)20)
+    extern const uint16_t armBitRevIndexTable16[ARMBITREVINDEXTABLE_16_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_32)
+    #define ARMBITREVINDEXTABLE_32_TABLE_LENGTH ((uint16_t)48)
+    extern const uint16_t armBitRevIndexTable32[ARMBITREVINDEXTABLE_32_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_64)
+    #define ARMBITREVINDEXTABLE_64_TABLE_LENGTH ((uint16_t)56)
+    extern const uint16_t armBitRevIndexTable64[ARMBITREVINDEXTABLE_64_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_128)
+    #define ARMBITREVINDEXTABLE_128_TABLE_LENGTH ((uint16_t)208)
+    extern const uint16_t armBitRevIndexTable128[ARMBITREVINDEXTABLE_128_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_256)
+    #define ARMBITREVINDEXTABLE_256_TABLE_LENGTH ((uint16_t)440)
+    extern const uint16_t armBitRevIndexTable256[ARMBITREVINDEXTABLE_256_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_512)
+    #define ARMBITREVINDEXTABLE_512_TABLE_LENGTH ((uint16_t)448)
+    extern const uint16_t armBitRevIndexTable512[ARMBITREVINDEXTABLE_512_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_1024)
+    #define ARMBITREVINDEXTABLE_1024_TABLE_LENGTH ((uint16_t)1800)
+    extern const uint16_t armBitRevIndexTable1024[ARMBITREVINDEXTABLE_1024_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_2048)
+    #define ARMBITREVINDEXTABLE_2048_TABLE_LENGTH ((uint16_t)3808)
+    extern const uint16_t armBitRevIndexTable2048[ARMBITREVINDEXTABLE_2048_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_4096)
+    #define ARMBITREVINDEXTABLE_4096_TABLE_LENGTH ((uint16_t)4032)
+    extern const uint16_t armBitRevIndexTable4096[ARMBITREVINDEXTABLE_4096_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+
+  /* fixed-point bit reversal tables */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16)
+    #define ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH ((uint16_t)12)
+    extern const uint16_t armBitRevIndexTable_fixed_16[ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_32)
+    #define ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH ((uint16_t)24)
+    extern const uint16_t armBitRevIndexTable_fixed_32[ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64)
+    #define ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH ((uint16_t)56)
+    extern const uint16_t armBitRevIndexTable_fixed_64[ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_128)
+    #define ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH ((uint16_t)112)
+    extern const uint16_t armBitRevIndexTable_fixed_128[ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256)
+    #define ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH ((uint16_t)240)
+    extern const uint16_t armBitRevIndexTable_fixed_256[ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_512)
+    #define ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH ((uint16_t)480)
+    extern const uint16_t armBitRevIndexTable_fixed_512[ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024)
+    #define ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH ((uint16_t)992)
+    extern const uint16_t armBitRevIndexTable_fixed_1024[ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_2048)
+    #define ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH ((uint16_t)1984)
+    extern const uint16_t armBitRevIndexTable_fixed_2048[ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)
+    #define ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH ((uint16_t)4032)
+    extern const uint16_t armBitRevIndexTable_fixed_4096[ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32)
+    extern const float32_t realCoefA[8192];
+    extern const float32_t realCoefB[8192];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31)
+    extern const q31_t realCoefAQ31[8192];
+    extern const q31_t realCoefBQ31[8192];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15)
+    extern const q15_t realCoefAQ15[8192];
+    extern const q15_t realCoefBQ15[8192];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128)
+    extern const float32_t Weights_128[256];
+    extern const float32_t cos_factors_128[128];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512)
+    extern const float32_t Weights_512[1024];
+    extern const float32_t cos_factors_512[512];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048)
+    extern const float32_t Weights_2048[4096];
+    extern const float32_t cos_factors_2048[2048];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192)
+    extern const float32_t Weights_8192[16384];
+    extern const float32_t cos_factors_8192[8192];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128)
+    extern const q15_t WeightsQ15_128[256];
+    extern const q15_t cos_factorsQ15_128[128];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512)
+    extern const q15_t WeightsQ15_512[1024];
+    extern const q15_t cos_factorsQ15_512[512];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048)
+    extern const q15_t WeightsQ15_2048[4096];
+    extern const q15_t cos_factorsQ15_2048[2048];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192)
+    extern const q15_t WeightsQ15_8192[16384];
+    extern const q15_t cos_factorsQ15_8192[8192];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128)
+    extern const q31_t WeightsQ31_128[256];
+    extern const q31_t cos_factorsQ31_128[128];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512)
+    extern const q31_t WeightsQ31_512[1024];
+    extern const q31_t cos_factorsQ31_512[512];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048)
+    extern const q31_t WeightsQ31_2048[4096];
+    extern const q31_t cos_factorsQ31_2048[2048];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192)
+    extern const q31_t WeightsQ31_8192[16384];
+    extern const q31_t cos_factorsQ31_8192[8192];
+  #endif
+
+#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_TABLES) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_ALLOW_TABLES)
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_RECIP_Q15)
+    extern const q15_t armRecipTableQ15[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_RECIP_Q31)
+    extern const q31_t armRecipTableQ31[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+
+  /* Tables for Fast Math Sine and Cosine */
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_F32)
+    extern const float32_t sinTable_f32[FAST_MATH_TABLE_SIZE + 1];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_Q31)
+    extern const q31_t sinTable_q31[FAST_MATH_TABLE_SIZE + 1];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_Q15)
+    extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+
+  #if defined(ARM_MATH_MVEI)
+     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
+       extern const q31_t sqrtTable_Q31[256];
+     #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+  #endif
+
+  #if defined(ARM_MATH_MVEI)
+     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
+       extern const q15_t sqrtTable_Q15[256];
+     #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
+  #endif
+
+#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_TABLES) */
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+       extern const float32_t exp_tab[8];
+       extern const float32_t __logf_lut_f32[8];
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+extern const unsigned char hwLUT[256];
+#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
+
+#endif /*  ARM_COMMON_TABLES_H */
+

+ 76 - 0
libraries/cmsis/cm4/core_support/arm_const_structs.h

@@ -0,0 +1,76 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_const_structs.h
+ * Description:  Constant structs that are initialized for user convenience.
+ *               For example, some can be given as arguments to the arm_cfft_f32() function.
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_CONST_STRUCTS_H
+#define _ARM_CONST_STRUCTS_H
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len16;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len32;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len64;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len128;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len256;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len512;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096;
+
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len16;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len32;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len64;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len128;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len256;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len512;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096;
+
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len16;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len32;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len64;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len128;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len256;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len512;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096;
+
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len16;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len32;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len64;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len128;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len256;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len512;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096;
+
+#endif

+ 348 - 0
libraries/cmsis/cm4/core_support/arm_helium_utils.h

@@ -0,0 +1,348 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_helium_utils.h
+ * Description:  Utility functions for Helium development
+ *
+ * $Date:        09. September 2019
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_UTILS_HELIUM_H_
+#define _ARM_UTILS_HELIUM_H_
+
+/***************************************
+
+Definitions available for MVEF and MVEI
+
+***************************************/
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+
+#define INACTIVELANE            0 /* inactive lane content */
+
+
+#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */
+
+/***************************************
+
+Definitions available for MVEF only
+
+***************************************/
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)
+
+__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
+{
+    float32_t acc;
+
+    acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) +
+          vgetq_lane(in, 2) + vgetq_lane(in, 3);
+
+    return acc;
+}
+
+/* newton initial guess */
+#define INVSQRT_MAGIC_F32           0x5f3759df
+
+#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
+{                                                     \
+    float32x4_t tmp;                                  \
+                                                      \
+    /* tmp = xhalf * x * x */                         \
+    tmp = vmulq(xStart, xStart);                      \
+    tmp = vmulq(tmp, xHalf);                          \
+    /* (1.5f - xhalf * x * x) */                      \
+    tmp = vsubq(vdupq_n_f32(1.5f), tmp);              \
+    /* x = x*(1.5f-xhalf*x*x); */                     \
+    invSqrt = vmulq(tmp, xStart);                     \
+}
+#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */
+
+/***************************************
+
+Definitions available for MVEI only
+
+***************************************/
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
+
+
+#include "arm_common_tables.h"
+
+/* Following functions are used to transpose matrix in f32 and q31 cases */
+__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
+    uint32_t * pDataSrc,
+    uint32_t * pDataDest)
+{
+    static const uint32x4_t vecOffs = { 0, 2, 1, 3 };
+    /*
+     *
+     * | 0   1 |   =>  |  0   2 |
+     * | 2   3 |       |  1   3 |
+     *
+     */
+    uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc);
+    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve(
+    uint32_t * pDataSrc,
+    uint32_t * pDataDest)
+{
+    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+    /*
+     *
+     *  | 0   1   2 |       | 0   3   6 |  4 x 32 flattened version | 0   3   6   1 |
+     *  | 3   4   5 |   =>  | 1   4   7 |            =>             | 4   7   2   5 |
+     *  | 6   7   8 |       | 2   5   8 |       (row major)         | 8   .   .   . |
+     *
+     */
+    uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc);
+    uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]);
+
+    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1);
+    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2);
+
+    pDataDest[8] = pDataSrc[8];
+
+    return (ARM_MATH_SUCCESS);
+}
+
+__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+    /*
+     * 4x4 Matrix transposition
+     * is 4 x de-interleave operation
+     *
+     * 0   1   2   3       0   4   8   12
+     * 4   5   6   7       1   5   9   13
+     * 8   9   10  11      2   6   10  14
+     * 12  13  14  15      3   7   11  15
+     */
+
+    uint32x4x4_t vecIn;
+
+    vecIn = vld4q((uint32_t const *) pDataSrc);
+    vstrwq(pDataDest, vecIn.val[0]);
+    pDataDest += 4;
+    vstrwq(pDataDest, vecIn.val[1]);
+    pDataDest += 4;
+    vstrwq(pDataDest, vecIn.val[2]);
+    pDataDest += 4;
+    vstrwq(pDataDest, vecIn.val[3]);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+
+__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
+    uint16_t    srcRows,
+    uint16_t    srcCols,
+    uint32_t  * pDataSrc,
+    uint32_t  * pDataDest)
+{
+    uint32x4_t vecOffs;
+    uint32_t  i;
+    uint32_t  blkCnt;
+    uint32_t const *pDataC;
+    uint32_t *pDataDestR;
+    uint32x4_t vecIn;
+
+    vecOffs = vidupq_u32((uint32_t)0, 1);
+    vecOffs = vecOffs * srcCols;
+
+    i = srcCols;
+    do
+    {
+        pDataC = (uint32_t const *) pDataSrc;
+        pDataDestR = pDataDest;
+
+        blkCnt = srcRows >> 2;
+        while (blkCnt > 0U)
+        {
+            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
+            vstrwq(pDataDestR, vecIn);
+            pDataDestR += 4;
+            pDataC = pDataC + srcCols * 4;
+            /*
+             * Decrement the blockSize loop counter
+             */
+            blkCnt--;
+        }
+
+        /*
+         * tail
+         */
+        blkCnt = srcRows & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
+            vstrwq_p(pDataDestR, vecIn, p0);
+        }
+
+        pDataSrc += 1;
+        pDataDest += srcRows;
+    }
+    while (--i);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
+__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
+{
+    q63x2_t         vecTmpLL;
+    q31x4_t         vecTmp0, vecTmp1;
+    q31_t           scale;
+    q63_t           tmp64;
+    q31x4_t         vecNrm, vecDst, vecIdx, vecSignBits;
+
+
+    vecSignBits = vclsq(vecIn);
+    vecSignBits = vbicq(vecSignBits, 1);
+    /*
+     * in = in << no_of_sign_bits;
+     */
+    vecNrm = vshlq(vecIn, vecSignBits);
+    /*
+     * index = in >> 24;
+     */
+    vecIdx = vecNrm >> 24;
+    vecIdx = vecIdx << 1;
+
+    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
+
+    vecIdx = vecIdx + 1;
+
+    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
+
+    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
+    vecTmp0 = vecTmp0 - vecTmp1;
+    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
+    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
+    vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1;
+    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
+    vecTmpLL = vmullbq_int(vecNrm, vecTmp0);
+
+    /*
+     * scale elements 0, 2
+     */
+    scale = 26 + (vecSignBits[0] >> 1);
+    tmp64 = asrl(vecTmpLL[0], scale);
+    vecDst[0] = (q31_t) tmp64;
+
+    scale = 26 + (vecSignBits[2] >> 1);
+    tmp64 = asrl(vecTmpLL[1], scale);
+    vecDst[2] = (q31_t) tmp64;
+
+    vecTmpLL = vmulltq_int(vecNrm, vecTmp0);
+
+    /*
+     * scale elements 1, 3
+     */
+    scale = 26 + (vecSignBits[1] >> 1);
+    tmp64 = asrl(vecTmpLL[0], scale);
+    vecDst[1] = (q31_t) tmp64;
+
+    scale = 26 + (vecSignBits[3] >> 1);
+    tmp64 = asrl(vecTmpLL[1], scale);
+    vecDst[3] = (q31_t) tmp64;
+    /*
+     * set negative values to 0
+     */
+    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0));
+
+    return vecDst;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
+__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
+{
+    q31x4_t         vecTmpLev, vecTmpLodd, vecSignL;
+    q15x8_t         vecTmp0, vecTmp1;
+    q15x8_t         vecNrm, vecDst, vecIdx, vecSignBits;
+
+    vecDst = vuninitializedq_s16();
+
+    vecSignBits = vclsq(vecIn);
+    vecSignBits = vbicq(vecSignBits, 1);
+    /*
+     * in = in << no_of_sign_bits;
+     */
+    vecNrm = vshlq(vecIn, vecSignBits);
+
+    vecIdx = vecNrm >> 8;
+    vecIdx = vecIdx << 1;
+
+    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
+
+    vecIdx = vecIdx + 1;
+
+    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
+
+    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
+    vecTmp0 = vecTmp0 - vecTmp1;
+    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
+    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
+    vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1;
+    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
+
+    vecSignBits = vecSignBits >> 1;
+
+    vecTmpLev = vmullbq_int(vecNrm, vecTmp0);
+    vecTmpLodd = vmulltq_int(vecNrm, vecTmp0);
+
+    vecTmp0 = vecSignBits + 10;
+    /*
+     * negate sign to apply register based vshl
+     */
+    vecTmp0 = -vecTmp0;
+
+    /*
+     * shift even elements
+     */
+    vecSignL = vmovlbq(vecTmp0);
+    vecTmpLev = vshlq(vecTmpLev, vecSignL);
+    /*
+     * shift odd elements
+     */
+    vecSignL = vmovltq(vecTmp0);
+    vecTmpLodd = vshlq(vecTmpLodd, vecSignL);
+    /*
+     * merge and narrow odd and even parts
+     */
+    vecDst = vmovnbq_s32(vecDst, vecTmpLev);
+    vecDst = vmovntq_s32(vecDst, vecTmpLodd);
+    /*
+     * set negative values to 0
+     */
+    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0));
+
+    return vecDst;
+}
+#endif
+
+#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */
+
+#endif

+ 8970 - 0
libraries/cmsis/cm4/core_support/arm_math.h

@@ -0,0 +1,8970 @@
+/******************************************************************************
+ * @file     arm_math.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     18. March 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+   \mainpage CMSIS DSP Software Library
+   *
+   * Introduction
+   * ------------
+   *
+   * This user manual describes the CMSIS DSP software library,
+   * a suite of common signal processing functions for use on Cortex-M and Cortex-A processor
+   * based devices.
+   *
+   * The library is divided into a number of functions each covering a specific category:
+   * - Basic math functions
+   * - Fast math functions
+   * - Complex math functions
+   * - Filtering functions
+   * - Matrix functions
+   * - Transform functions
+   * - Motor control functions
+   * - Statistical functions
+   * - Support functions
+   * - Interpolation functions
+   * - Support Vector Machine functions (SVM)
+   * - Bayes classifier functions
+   * - Distance functions
+   *
+   * The library has generally separate functions for operating on 8-bit integers, 16-bit integers,
+   * 32-bit integer and 32-bit floating-point values.
+   *
+   * Using the Library
+   * ------------
+   *
+   * The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
+   *
+   * Here is the list of pre-built libraries :
+   * - arm_cortexM7lfdp_math.lib (Cortex-M7, Little endian, Double Precision Floating Point Unit)
+   * - arm_cortexM7bfdp_math.lib (Cortex-M7, Big endian, Double Precision Floating Point Unit)
+   * - arm_cortexM7lfsp_math.lib (Cortex-M7, Little endian, Single Precision Floating Point Unit)
+   * - arm_cortexM7bfsp_math.lib (Cortex-M7, Big endian and Single Precision Floating Point Unit on)
+   * - arm_cortexM7l_math.lib (Cortex-M7, Little endian)
+   * - arm_cortexM7b_math.lib (Cortex-M7, Big endian)
+   * - arm_cortexM4lf_math.lib (Cortex-M4, Little endian, Floating Point Unit)
+   * - arm_cortexM4bf_math.lib (Cortex-M4, Big endian, Floating Point Unit)
+   * - arm_cortexM4l_math.lib (Cortex-M4, Little endian)
+   * - arm_cortexM4b_math.lib (Cortex-M4, Big endian)
+   * - arm_cortexM3l_math.lib (Cortex-M3, Little endian)
+   * - arm_cortexM3b_math.lib (Cortex-M3, Big endian)
+   * - arm_cortexM0l_math.lib (Cortex-M0 / Cortex-M0+, Little endian)
+   * - arm_cortexM0b_math.lib (Cortex-M0 / Cortex-M0+, Big endian)
+   * - arm_ARMv8MBLl_math.lib (Armv8-M Baseline, Little endian)
+   * - arm_ARMv8MMLl_math.lib (Armv8-M Mainline, Little endian)
+   * - arm_ARMv8MMLlfsp_math.lib (Armv8-M Mainline, Little endian, Single Precision Floating Point Unit)
+   * - arm_ARMv8MMLld_math.lib (Armv8-M Mainline, Little endian, DSP instructions)
+   * - arm_ARMv8MMLldfsp_math.lib (Armv8-M Mainline, Little endian, DSP instructions, Single Precision Floating Point Unit)
+   *
+   * The library functions are declared in the public file <code>arm_math.h</code> which is placed in the <code>Include</code> folder.
+   * Simply include this file and link the appropriate library in the application and begin calling the library functions. The Library supports single
+   * public header file <code> arm_math.h</code> for Cortex-M cores with little endian and big endian. Same header file will be used for floating point unit(FPU) variants.
+   *
+   *
+   * Examples
+   * --------
+   *
+   * The library ships with a number of examples which demonstrate how to use the library functions.
+   *
+   * Toolchain Support
+   * ------------
+   *
+   * The library is now tested on Fast Models building with cmake.
+   * Core M0, M7, A5 are tested.
+   *
+   *
+   *
+   * Building the Library
+   * ------------
+   *
+   * The library installer contains a project file to rebuild libraries on MDK toolchain in the <code>CMSIS\\DSP\\Projects\\ARM</code> folder.
+   * - arm_cortexM_math.uvprojx
+   *
+   *
+   * The libraries can be built by opening the arm_cortexM_math.uvprojx project in MDK-ARM, selecting a specific target, and defining the optional preprocessor macros detailed above.
+   *
+   * There is also a work in progress cmake build. The README file is giving more details.
+   *
+   * Preprocessor Macros
+   * ------------
+   *
+   * Each library project have different preprocessor macros.
+   *
+   * - ARM_MATH_BIG_ENDIAN:
+   *
+   * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default library builds for little endian targets.
+   *
+   * - ARM_MATH_MATRIX_CHECK:
+   *
+   * Define macro ARM_MATH_MATRIX_CHECK for checking on the input and output sizes of matrices
+   *
+   * - ARM_MATH_ROUNDING:
+   *
+   * Define macro ARM_MATH_ROUNDING for rounding on support functions
+   *
+   * - ARM_MATH_LOOPUNROLL:
+   *
+   * Define macro ARM_MATH_LOOPUNROLL to enable manual loop unrolling in DSP functions
+   *
+   * - ARM_MATH_NEON:
+   *
+   * Define macro ARM_MATH_NEON to enable Neon versions of the DSP functions.
+   * It is not enabled by default when Neon is available because performances are
+   * dependent on the compiler and target architecture.
+   *
+   * - ARM_MATH_NEON_EXPERIMENTAL:
+   *
+   * Define macro ARM_MATH_NEON_EXPERIMENTAL to enable experimental Neon versions of
+   * of some DSP functions. Experimental Neon versions currently do not have better
+   * performances than the scalar versions.
+   *
+   * - ARM_MATH_HELIUM:
+   *
+   * It implies the flags ARM_MATH_MVEF and ARM_MATH_MVEI and ARM_MATH_FLOAT16.
+   *
+   * - ARM_MATH_MVEF:
+   *
+   * Select Helium versions of the f32 algorithms.
+   * It implies ARM_MATH_FLOAT16 and ARM_MATH_MVEI.
+   *
+   * - ARM_MATH_MVEI:
+   *
+   * Select Helium versions of the int and fixed point algorithms.
+   *
+   * - ARM_MATH_FLOAT16:
+   *
+   * Float16 implementations of some algorithms (Requires MVE extension).
+   *
+   * <hr>
+   * CMSIS-DSP in ARM::CMSIS Pack
+   * -----------------------------
+   *
+   * The following files relevant to CMSIS-DSP are present in the <b>ARM::CMSIS</b> Pack directories:
+   * |File/Folder                      |Content                                                                 |
+   * |---------------------------------|------------------------------------------------------------------------|
+   * |\b CMSIS\\Documentation\\DSP     | This documentation                                                     |
+   * |\b CMSIS\\DSP\\DSP_Lib_TestSuite | DSP_Lib test suite                                                     |
+   * |\b CMSIS\\DSP\\Examples          | Example projects demonstrating the usage of the library functions      |
+   * |\b CMSIS\\DSP\\Include           | DSP_Lib include files                                                  |
+   * |\b CMSIS\\DSP\\Lib               | DSP_Lib binaries                                                       |
+   * |\b CMSIS\\DSP\\Projects          | Projects to rebuild DSP_Lib binaries                                   |
+   * |\b CMSIS\\DSP\\Source            | DSP_Lib source files                                                   |
+   *
+   * <hr>
+   * Revision History of CMSIS-DSP
+   * ------------
+   * Please refer to \ref ChangeLog_pg.
+   */
+
+
+/**
+ * @defgroup groupMath Basic Math Functions
+ */
+
+/**
+ * @defgroup groupFastMath Fast Math Functions
+ * This set of functions provides a fast approximation to sine, cosine, and square root.
+ * As compared to most of the other functions in the CMSIS math library, the fast math functions
+ * operate on individual values and not arrays.
+ * There are separate functions for Q15, Q31, and floating-point data.
+ *
+ */
+
+/**
+ * @defgroup groupCmplxMath Complex Math Functions
+ * This set of functions operates on complex data vectors.
+ * The data in the complex arrays is stored in an interleaved fashion
+ * (real, imag, real, imag, ...).
+ * In the API functions, the number of samples in a complex array refers
+ * to the number of complex values; the array contains twice this number of
+ * real values.
+ */
+
+/**
+ * @defgroup groupFilters Filtering Functions
+ */
+
+/**
+ * @defgroup groupMatrix Matrix Functions
+ *
+ * This set of functions provides basic matrix math operations.
+ * The functions operate on matrix data structures.  For example,
+ * the type
+ * definition for the floating-point matrix structure is shown
+ * below:
+ * <pre>
+ *     typedef struct
+ *     {
+ *       uint16_t numRows;     // number of rows of the matrix.
+ *       uint16_t numCols;     // number of columns of the matrix.
+ *       float32_t *pData;     // points to the data of the matrix.
+ *     } arm_matrix_instance_f32;
+ * </pre>
+ * There are similar definitions for Q15 and Q31 data types.
+ *
+ * The structure specifies the size of the matrix and then points to
+ * an array of data.  The array is of size <code>numRows X numCols</code>
+ * and the values are arranged in row order.  That is, the
+ * matrix element (i, j) is stored at:
+ * <pre>
+ *     pData[i*numCols + j]
+ * </pre>
+ *
+ * \par Init Functions
+ * There is an associated initialization function for each type of matrix
+ * data structure.
+ * The initialization function sets the values of the internal structure fields.
+ * Refer to \ref arm_mat_init_f32(), \ref arm_mat_init_q31() and \ref arm_mat_init_q15()
+ * for floating-point, Q31 and Q15 types,  respectively.
+ *
+ * \par
+ * Use of the initialization function is optional. However, if initialization function is used
+ * then the instance structure cannot be placed into a const data section.
+ * To place the instance structure in a const data
+ * section, manually initialize the data structure.  For example:
+ * <pre>
+ * <code>arm_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
+ * <code>arm_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
+ * <code>arm_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
+ * </pre>
+ * where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
+ * specifies the number of columns, and <code>pData</code> points to the
+ * data array.
+ *
+ * \par Size Checking
+ * By default all of the matrix functions perform size checking on the input and
+ * output matrices. For example, the matrix addition function verifies that the
+ * two input matrices and the output matrix all have the same number of rows and
+ * columns. If the size check fails the functions return:
+ * <pre>
+ *     ARM_MATH_SIZE_MISMATCH
+ * </pre>
+ * Otherwise the functions return
+ * <pre>
+ *     ARM_MATH_SUCCESS
+ * </pre>
+ * There is some overhead associated with this matrix size checking.
+ * The matrix size checking is enabled via the \#define
+ * <pre>
+ *     ARM_MATH_MATRIX_CHECK
+ * </pre>
+ * within the library project settings.  By default this macro is defined
+ * and size checking is enabled. By changing the project settings and
+ * undefining this macro size checking is eliminated and the functions
+ * run a bit faster. With size checking disabled the functions always
+ * return <code>ARM_MATH_SUCCESS</code>.
+ */
+
+/**
+ * @defgroup groupTransforms Transform Functions
+ */
+
+/**
+ * @defgroup groupController Controller Functions
+ */
+
+/**
+ * @defgroup groupStats Statistics Functions
+ */
+
+/**
+ * @defgroup groupSupport Support Functions
+ */
+
+/**
+ * @defgroup groupInterpolation Interpolation Functions
+ * These functions perform 1- and 2-dimensional interpolation of data.
+ * Linear interpolation is used for 1-dimensional data and
+ * bilinear interpolation is used for 2-dimensional data.
+ */
+
+/**
+ * @defgroup groupExamples Examples
+ */
+
+/**
+ * @defgroup groupSVM SVM Functions
+ * This set of functions is implementing SVM classification on 2 classes.
+ * The training must be done from scikit-learn. The parameters can be easily
+ * generated from the scikit-learn object. Some examples are given in
+ * DSP/Testing/PatternGeneration/SVM.py
+ *
+ * If more than 2 classes are needed, the functions in this folder
+ * will have to be used, as building blocks, to do multi-class classification.
+ *
+ * No multi-class classification is provided in this SVM folder.
+ *
+ */
+
+
+/**
+ * @defgroup groupBayes Bayesian estimators
+ *
+ * Implement the naive gaussian Bayes estimator.
+ * The training must be done from scikit-learn.
+ *
+ * The parameters can be easily
+ * generated from the scikit-learn object. Some examples are given in
+ * DSP/Testing/PatternGeneration/Bayes.py
+ */
+
+/**
+ * @defgroup groupDistance Distance functions
+ *
+ * Distance functions for use with clustering algorithms.
+ * There are distance functions for float vectors and boolean vectors.
+ *
+ */
+
+
+#ifndef _ARM_MATH_H
+#define _ARM_MATH_H
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/* Compiler specific diagnostic adjustment */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wsign-conversion"
+  #pragma GCC diagnostic ignored "-Wconversion"
+  #pragma GCC diagnostic ignored "-Wunused-parameter"
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+
+/* Included for instrinsics definitions */
+#if defined (_MSC_VER )
+#include <stdint.h>
+#define __STATIC_FORCEINLINE static __forceinline
+#define __STATIC_INLINE static __inline
+#define __ALIGNED(x) __declspec(align(x))
+
+#elif defined (__GNUC_PYTHON__)
+#include <stdint.h>
+#define  __ALIGNED(x) __attribute__((aligned(x)))
+#define __STATIC_FORCEINLINE static __attribute__((inline))
+#define __STATIC_INLINE static __attribute__((inline))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wattributes"
+
+#else
+#include "cmsis_compiler.h"
+#endif
+
+
+
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+
+
+#define F64_MAX   ((float64_t)DBL_MAX)
+#define F32_MAX   ((float32_t)FLT_MAX)
+
+#if defined(ARM_MATH_FLOAT16)
+#define F16_MAX   ((float16_t)FLT_MAX)
+#endif
+
+#define F64_MIN   (-DBL_MAX)
+#define F32_MIN   (-FLT_MAX)
+
+#if defined(ARM_MATH_FLOAT16)
+#define F16_MIN   (-(float16_t)FLT_MAX)
+#endif
+
+#define F64_ABSMAX   ((float64_t)DBL_MAX)
+#define F32_ABSMAX   ((float32_t)FLT_MAX)
+
+#if defined(ARM_MATH_FLOAT16)
+#define F16_ABSMAX   ((float16_t)FLT_MAX)
+#endif
+
+#define F64_ABSMIN   ((float64_t)0.0)
+#define F32_ABSMIN   ((float32_t)0.0)
+
+#if defined(ARM_MATH_FLOAT16)
+#define F16_ABSMIN   ((float16_t)0.0)
+#endif
+
+#define Q31_MAX   ((q31_t)(0x7FFFFFFFL))
+#define Q15_MAX   ((q15_t)(0x7FFF))
+#define Q7_MAX    ((q7_t)(0x7F))
+#define Q31_MIN   ((q31_t)(0x80000000L))
+#define Q15_MIN   ((q15_t)(0x8000))
+#define Q7_MIN    ((q7_t)(0x80))
+
+#define Q31_ABSMAX   ((q31_t)(0x7FFFFFFFL))
+#define Q15_ABSMAX   ((q15_t)(0x7FFF))
+#define Q7_ABSMAX    ((q7_t)(0x7F))
+#define Q31_ABSMIN   ((q31_t)0)
+#define Q15_ABSMIN   ((q15_t)0)
+#define Q7_ABSMIN    ((q7_t)0)
+
+/* evaluate ARM DSP feature */
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+  #define ARM_MATH_DSP                   1
+#endif
+
+#if defined(ARM_MATH_NEON)
+#include <arm_neon.h>
+#endif
+
+#if defined (ARM_MATH_HELIUM)
+  #define ARM_MATH_MVEF
+  #define ARM_MATH_FLOAT16
+#endif
+
+#if defined (ARM_MATH_MVEF)
+  #define ARM_MATH_MVEI
+  #define ARM_MATH_FLOAT16
+#endif
+
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+#include <arm_mve.h>
+#endif
+
+
+  /**
+   * @brief Macros required for reciprocal calculation in Normalized LMS
+   */
+
+#define DELTA_Q31          ((q31_t)(0x100))
+#define DELTA_Q15          ((q15_t)0x5)
+#define INDEX_MASK         0x0000003F
+#ifndef PI
+  #define PI               3.14159265358979f
+#endif
+
+  /**
+   * @brief Macros required for SINE and COSINE Fast math approximations
+   */
+
+#define FAST_MATH_TABLE_SIZE  512
+#define FAST_MATH_Q31_SHIFT   (32 - 10)
+#define FAST_MATH_Q15_SHIFT   (16 - 10)
+#define CONTROLLER_Q31_SHIFT  (32 - 9)
+#define TABLE_SPACING_Q31     0x400000
+#define TABLE_SPACING_Q15     0x80
+
+  /**
+   * @brief Macros required for SINE and COSINE Controller functions
+   */
+  /* 1.31(q31) Fixed value of 2/360 */
+  /* -1 to +1 is divided into 360 values so total spacing is (2/360) */
+#define INPUT_SPACING         0xB60B61
+
+  /**
+   * @brief Macros for complex numbers
+   */
+
+  /* Dimension C vector space */
+  #define CMPLX_DIM 2
+
+  /**
+   * @brief Error status returned by some functions in the library.
+   */
+
+  typedef enum
+  {
+    ARM_MATH_SUCCESS        =  0,        /**< No error */
+    ARM_MATH_ARGUMENT_ERROR = -1,        /**< One or more arguments are incorrect */
+    ARM_MATH_LENGTH_ERROR   = -2,        /**< Length of data buffer is incorrect */
+    ARM_MATH_SIZE_MISMATCH  = -3,        /**< Size of matrices is not compatible with the operation */
+    ARM_MATH_NANINF         = -4,        /**< Not-a-number (NaN) or infinity is generated */
+    ARM_MATH_SINGULAR       = -5,        /**< Input matrix is singular and cannot be inverted */
+    ARM_MATH_TEST_FAILURE   = -6         /**< Test Failed */
+  } arm_status;
+
+  /**
+   * @brief 8-bit fractional data type in 1.7 format.
+   */
+  typedef int8_t q7_t;
+
+  /**
+   * @brief 16-bit fractional data type in 1.15 format.
+   */
+  typedef int16_t q15_t;
+
+  /**
+   * @brief 32-bit fractional data type in 1.31 format.
+   */
+  typedef int32_t q31_t;
+
+  /**
+   * @brief 64-bit fractional data type in 1.63 format.
+   */
+  typedef int64_t q63_t;
+
+  /**
+   * @brief 32-bit floating-point type definition.
+   */
+  typedef float float32_t;
+
+  /**
+   * @brief 64-bit floating-point type definition.
+   */
+  typedef double float64_t;
+
+  /**
+   * @brief vector types
+   */
+#if defined(ARM_MATH_NEON) || defined (ARM_MATH_MVEI)
+  /**
+   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
+   */
+  typedef int64x2_t q63x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 1.31 format.
+   */
+  typedef int32x4_t q31x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector data type with 16-bit alignement in 1.15 format.
+   */
+  typedef __ALIGNED(2) int16x8_t q15x8_t;
+
+ /**
+   * @brief 8-bit fractional 128-bit vector data type with 8-bit alignement in 1.7 format.
+   */
+  typedef __ALIGNED(1) int8x16_t q7x16_t;
+
+    /**
+   * @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
+   */
+  typedef int32x4x2_t q31x4x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
+   */
+  typedef int32x4x4_t q31x4x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
+   */
+  typedef int16x8x2_t q15x8x2_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
+   */
+  typedef int16x8x4_t q15x8x4_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
+   */
+  typedef int8x16x2_t q7x16x2_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format.
+   */
+   typedef int8x16x4_t q7x16x4_t;
+
+  /**
+   * @brief 32-bit fractional data type in 9.23 format.
+   */
+  typedef int32_t q23_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 9.23 format.
+   */
+  typedef int32x4_t q23x4_t;
+
+  /**
+   * @brief 64-bit status 128-bit vector data type.
+   */
+  typedef int64x2_t status64x2_t;
+
+  /**
+   * @brief 32-bit status 128-bit vector data type.
+   */
+  typedef int32x4_t status32x4_t;
+
+  /**
+   * @brief 16-bit status 128-bit vector data type.
+   */
+  typedef int16x8_t status16x8_t;
+
+  /**
+   * @brief 8-bit status 128-bit vector data type.
+   */
+  typedef int8x16_t status8x16_t;
+
+
+#endif
+
+#if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF) /* floating point vector*/
+  /**
+   * @brief 32-bit floating-point 128-bit vector type
+   */
+  typedef float32x4_t f32x4_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit floating-point 128-bit vector data type
+   */
+  typedef __ALIGNED(2) float16x8_t f16x8_t;
+#endif
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector pair data type
+   */
+  typedef float32x4x2_t f32x4x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float32x4x4_t f32x4x4_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit floating-point 128-bit vector pair data type
+   */
+  typedef float16x8x2_t f16x8x2_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float16x8x4_t f16x8x4_t;
+#endif
+
+  /**
+   * @brief 32-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any32x4_t
+  {
+      float32x4_t     f;
+      int32x4_t       i;
+  } any32x4_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any16x8_t
+  {
+      float16x8_t     f;
+      int16x8_t       i;
+  } any16x8_t;
+#endif
+
+#endif
+
+#if defined(ARM_MATH_NEON)
+  /**
+   * @brief 32-bit fractional 64-bit vector data type in 1.31 format.
+   */
+  typedef int32x2_t  q31x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector data type in 1.15 format.
+   */
+  typedef  __ALIGNED(2) int16x4_t q15x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector data type in 1.7 format.
+   */
+  typedef  __ALIGNED(1) int8x8_t q7x8_t;
+
+  /**
+   * @brief 32-bit float 64-bit vector data type.
+   */
+  typedef float32x2_t  f32x2_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit float 64-bit vector data type.
+   */
+  typedef  __ALIGNED(2) float16x4_t f16x4_t;
+#endif
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float32x4x3_t f32x4x3_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float16x8x3_t f16x8x3_t;
+#endif
+
+  /**
+   * @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x8x3_t q15x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x16x3_t q7x16x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector pair data type
+   */
+  typedef float32x2x2_t f32x2x2_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float32x2x3_t f32x2x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float32x2x4_t f32x2x4_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit floating-point 64-bit vector pair data type
+   */
+  typedef float16x4x2_t f16x4x2_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float16x4x3_t f16x4x3_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float16x4x4_t f16x4x4_t;
+#endif
+
+  /**
+   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
+   */
+  typedef int32x2x2_t q31x2x2_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x2x3_t q31x2x3_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x2x4_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
+   */
+  typedef int16x4x3_t q15x4x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
+   */
+  typedef int8x8x2_t q7x8x2_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x8x3_t q7x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
+   */
+  typedef int8x8x4_t q7x8x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any32x2_t
+  {
+      float32x2_t     f;
+      int32x2_t       i;
+  } any32x2_t;
+
+#if defined(ARM_MATH_FLOAT16)
+  /**
+   * @brief 16-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any16x4_t
+  {
+      float16x4_t     f;
+      int16x4_t       i;
+  } any16x4_t;
+#endif
+
+  /**
+   * @brief 32-bit status 64-bit vector data type.
+   */
+  typedef int32x4_t status32x2_t;
+
+  /**
+   * @brief 16-bit status 64-bit vector data type.
+   */
+  typedef int16x8_t status16x4_t;
+
+  /**
+   * @brief 8-bit status 64-bit vector data type.
+   */
+  typedef int8x16_t status8x8_t;
+
+#endif
+
+
+
+/**
+  @brief definition to read/write two 16 bit values.
+  @deprecated
+ */
+#if   defined ( __CC_ARM )
+  #define __SIMD32_TYPE int32_t __packed
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __GNUC__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __ICCARM__ )
+  #define __SIMD32_TYPE int32_t __packed
+#elif defined ( __TI_ARM__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __CSMC__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __TASKING__ )
+  #define __SIMD32_TYPE __un(aligned) int32_t
+#elif defined(_MSC_VER )
+  #define __SIMD32_TYPE int32_t
+#else
+  #error Unknown compiler
+#endif
+
+#define __SIMD32(addr)        (*(__SIMD32_TYPE **) & (addr))
+#define __SIMD32_CONST(addr)  ( (__SIMD32_TYPE * )   (addr))
+#define _SIMD32_OFFSET(addr)  (*(__SIMD32_TYPE * )   (addr))
+#define __SIMD64(addr)        (*(      int64_t **) & (addr))
+
+#define STEP(x) (x) <= 0 ? 0 : 1
+#define SQ(x) ((x) * (x))
+
+/* SIMD replacement */
+
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer.
+  @param[in]     pQ15      points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2 (
+  q15_t * pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, pQ15, 4);
+#else
+  val = (pQ15[1] << 16) | (pQ15[0] & 0x0FFFF) ;
+#endif
+
+  return (val);
+}
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and increment pointer afterwards.
+  @param[in]     pQ15      points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2_ia (
+  q15_t ** pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ15, 4);
+#else
+  val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF);
+#endif
+
+ *pQ15 += 2;
+ return (val);
+}
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and decrement pointer afterwards.
+  @param[in]     pQ15      points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2_da (
+  q15_t ** pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ15, 4);
+#else
+  val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF);
+#endif
+
+  *pQ15 -= 2;
+  return (val);
+}
+
+/**
+  @brief         Write 2 Q15 to Q15 pointer and increment pointer afterwards.
+  @param[in]     pQ15      points to input value
+  @param[in]     value     Q31 value
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2_ia (
+  q15_t ** pQ15,
+  q31_t    value)
+{
+  q31_t val = value;
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (*pQ15, &val, 4);
+#else
+  (*pQ15)[0] = (val & 0x0FFFF);
+  (*pQ15)[1] = (val >> 16) & 0x0FFFF;
+#endif
+
+ *pQ15 += 2;
+}
+
+/**
+  @brief         Write 2 Q15 to Q15 pointer.
+  @param[in]     pQ15      points to input value
+  @param[in]     value     Q31 value
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2 (
+  q15_t * pQ15,
+  q31_t   value)
+{
+  q31_t val = value;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (pQ15, &val, 4);
+#else
+  pQ15[0] = val & 0x0FFFF;
+  pQ15[1] = val >> 16;
+#endif
+}
+
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer and increment pointer afterwards.
+  @param[in]     pQ7       points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4_ia (
+  q7_t ** pQ7)
+{
+  q31_t val;
+
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ7, 4);
+#else
+  val =(((*pQ7)[3] & 0x0FF) << 24)  | (((*pQ7)[2] & 0x0FF) << 16)  | (((*pQ7)[1] & 0x0FF) << 8)  | ((*pQ7)[0] & 0x0FF);
+#endif
+
+  *pQ7 += 4;
+
+  return (val);
+}
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer and decrement pointer afterwards.
+  @param[in]     pQ7       points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4_da (
+  q7_t ** pQ7)
+{
+  q31_t val;
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ7, 4);
+#else
+  val = ((((*pQ7)[3]) & 0x0FF) << 24) | ((((*pQ7)[2]) & 0x0FF) << 16)   | ((((*pQ7)[1]) & 0x0FF) << 8)  | ((*pQ7)[0] & 0x0FF);
+#endif
+  *pQ7 -= 4;
+
+  return (val);
+}
+
+/**
+  @brief         Write 4 Q7 to Q7 pointer and increment pointer afterwards.
+  @param[in]     pQ7       points to input value
+  @param[in]     value     Q31 value
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q7x4_ia (
+  q7_t ** pQ7,
+  q31_t   value)
+{
+  q31_t val = value;
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (*pQ7, &val, 4);
+#else
+  (*pQ7)[0] = val & 0x0FF;
+  (*pQ7)[1] = (val >> 8) & 0x0FF;
+  (*pQ7)[2] = (val >> 16) & 0x0FF;
+  (*pQ7)[3] = (val >> 24) & 0x0FF;
+
+#endif
+  *pQ7 += 4;
+}
+
+/*
+
+Normally those kind of definitions are in a compiler file
+in Core or Core_A.
+
+But for MSVC compiler it is a bit special. The goal is very specific
+to CMSIS-DSP and only to allow the use of this library from other
+systems like Python or Matlab.
+
+MSVC is not going to be used to cross-compile to ARM. So, having a MSVC
+compiler file in Core or Core_A would not make sense.
+
+*/
+#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+    __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
+    {
+      if (data == 0U) { return 32U; }
+
+      uint32_t count = 0U;
+      uint32_t mask = 0x80000000U;
+
+      while ((data & mask) == 0U)
+      {
+        count += 1U;
+        mask = mask >> 1U;
+      }
+      return count;
+    }
+
+  __STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+  {
+    if ((sat >= 1U) && (sat <= 32U))
+    {
+      const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+      const int32_t min = -1 - max ;
+      if (val > max)
+      {
+        return max;
+      }
+      else if (val < min)
+      {
+        return min;
+      }
+    }
+    return val;
+  }
+
+  __STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+  {
+    if (sat <= 31U)
+    {
+      const uint32_t max = ((1U << sat) - 1U);
+      if (val > (int32_t)max)
+      {
+        return max;
+      }
+      else if (val < 0)
+      {
+        return 0U;
+      }
+    }
+    return (uint32_t)val;
+  }
+#endif
+
+#ifndef ARM_MATH_DSP
+  /**
+   * @brief definition to pack two 16 bit values.
+   */
+  #define __PKHBT(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<    0) & (int32_t)0x0000FFFF) | \
+                                      (((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000)  )
+  #define __PKHTB(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<    0) & (int32_t)0xFFFF0000) | \
+                                      (((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF)  )
+#endif
+
+   /**
+   * @brief definition to pack four 8 bit values.
+   */
+#ifndef ARM_MATH_BIG_ENDIAN
+  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) <<  0) & (int32_t)0x000000FF) | \
+                                  (((int32_t)(v1) <<  8) & (int32_t)0x0000FF00) | \
+                                  (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \
+                                  (((int32_t)(v3) << 24) & (int32_t)0xFF000000)  )
+#else
+  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) <<  0) & (int32_t)0x000000FF) | \
+                                  (((int32_t)(v2) <<  8) & (int32_t)0x0000FF00) | \
+                                  (((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \
+                                  (((int32_t)(v0) << 24) & (int32_t)0xFF000000)  )
+#endif
+
+
+  /**
+   * @brief Clips Q63 to Q31 values.
+   */
+  __STATIC_FORCEINLINE q31_t clip_q63_to_q31(
+  q63_t x)
+  {
+    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
+      ((0x7FFFFFFF ^ ((q31_t) (x >> 63)))) : (q31_t) x;
+  }
+
+  /**
+   * @brief Clips Q63 to Q15 values.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q63_to_q15(
+  q63_t x)
+  {
+    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
+      ((0x7FFF ^ ((q15_t) (x >> 63)))) : (q15_t) (x >> 15);
+  }
+
+  /**
+   * @brief Clips Q31 to Q7 values.
+   */
+  __STATIC_FORCEINLINE q7_t clip_q31_to_q7(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 24) != ((q31_t) x >> 23)) ?
+      ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
+  }
+
+  /**
+   * @brief Clips Q31 to Q15 values.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q31_to_q15(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 16) != ((q31_t) x >> 15)) ?
+      ((0x7FFF ^ ((q15_t) (x >> 31)))) : (q15_t) x;
+  }
+
+  /**
+   * @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format.
+   */
+  __STATIC_FORCEINLINE q63_t mult32x64(
+  q63_t x,
+  q31_t y)
+  {
+    return ((((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32) +
+            (((q63_t) (x >> 32)                * y)      )  );
+  }
+
+  /**
+   * @brief Function to Calculates 1/in (reciprocal) value of Q31 Data type.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q31(
+        q31_t in,
+        q31_t * dst,
+  const q31_t * pRecipTable)
+  {
+    q31_t out;
+    uint32_t tempVal;
+    uint32_t index, i;
+    uint32_t signBits;
+
+    if (in > 0)
+    {
+      signBits = ((uint32_t) (__CLZ( in) - 1));
+    }
+    else
+    {
+      signBits = ((uint32_t) (__CLZ(-in) - 1));
+    }
+
+    /* Convert input sample to 1.31 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    index = (uint32_t)(in >> 24);
+    index = (index & INDEX_MASK);
+
+    /* 1.31 with exp 1 */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q63_t) in * out) >> 31);
+      tempVal = 0x7FFFFFFFu - tempVal;
+      /*      1.31 with exp 1 */
+      /* out = (q31_t) (((q63_t) out * tempVal) >> 30); */
+      out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1U);
+  }
+
+
+  /**
+   * @brief Function to Calculates 1/in (reciprocal) value of Q15 Data type.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q15(
+        q15_t in,
+        q15_t * dst,
+  const q15_t * pRecipTable)
+  {
+    q15_t out = 0;
+    uint32_t tempVal = 0;
+    uint32_t index = 0, i = 0;
+    uint32_t signBits = 0;
+
+    if (in > 0)
+    {
+      signBits = ((uint32_t)(__CLZ( in) - 17));
+    }
+    else
+    {
+      signBits = ((uint32_t)(__CLZ(-in) - 17));
+    }
+
+    /* Convert input sample to 1.15 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    index = (uint32_t)(in >>  8);
+    index = (index & INDEX_MASK);
+
+    /*      1.15 with exp 1  */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q31_t) in * out) >> 15);
+      tempVal = 0x7FFFu - tempVal;
+      /*      1.15 with exp 1 */
+      out = (q15_t) (((q31_t) out * tempVal) >> 14);
+      /* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1);
+  }
+
+/**
+ * @brief Integer exponentiation
+ * @param[in]    x           value
+ * @param[in]    nb          integer exponent >= 1
+ * @return x^nb
+ *
+ */
+__STATIC_INLINE float32_t arm_exponent_f32(float32_t x, int32_t nb)
+{
+    float32_t r = x;
+    nb --;
+    while(nb > 0)
+    {
+        r = r * x;
+        nb--;
+    }
+    return(r);
+}
+
+/**
+ * @brief  64-bit to 32-bit unsigned normalization
+ * @param[in]  in           is input unsigned long long value
+ * @param[out] normalized   is the 32-bit normalized value
+ * @param[out] norm         is norm scale
+ */
+__STATIC_INLINE  void arm_norm_64_to_32u(uint64_t in, int32_t * normalized, int32_t *norm)
+{
+    int32_t     n1;
+    int32_t     hi = (int32_t) (in >> 32);
+    int32_t     lo = (int32_t) ((in << 32) >> 32);
+
+    n1 = __CLZ(hi) - 32;
+    if (!n1)
+    {
+        /*
+         * input fits in 32-bit
+         */
+        n1 = __CLZ(lo);
+        if (!n1)
+        {
+            /*
+             * MSB set, need to scale down by 1
+             */
+            *norm = -1;
+            *normalized = (((uint32_t) lo) >> 1);
+        } else
+        {
+            if (n1 == 32)
+            {
+                /*
+                 * input is zero
+                 */
+                *norm = 0;
+                *normalized = 0;
+            } else
+            {
+                /*
+                 * 32-bit normalization
+                 */
+                *norm = n1 - 1;
+                *normalized = lo << *norm;
+            }
+        }
+    } else
+    {
+        /*
+         * input fits in 64-bit
+         */
+        n1 = 1 - n1;
+        *norm = -n1;
+        /*
+         * 64 bit normalization
+         */
+        *normalized = (((uint32_t) lo) >> n1) | (hi << (32 - n1));
+    }
+}
+
+__STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
+{
+    q31_t   result;
+    uint64_t   absNum;
+    int32_t   normalized;
+    int32_t   norm;
+
+    /*
+     * if sum fits in 32bits
+     * avoid costly 64-bit division
+     */
+    absNum = num > 0 ? num : -num;
+    arm_norm_64_to_32u(absNum, &normalized, &norm);
+    if (norm > 0)
+        /*
+         * 32-bit division
+         */
+        result = (q31_t) num / den;
+    else
+        /*
+         * 64-bit division
+         */
+        result = (q31_t) (num / den);
+
+    return result;
+}
+
+
+/*
+ * @brief C custom defined intrinsic functions
+ */
+#if !defined (ARM_MATH_DSP)
+
+  /*
+   * @brief C custom defined QADD8
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD8(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s, t, u;
+
+    r = __SSAT(((((q31_t)x << 24) >> 24) + (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    s = __SSAT(((((q31_t)x << 16) >> 24) + (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    t = __SSAT(((((q31_t)x <<  8) >> 24) + (((q31_t)y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    u = __SSAT(((((q31_t)x      ) >> 24) + (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
+
+    return ((uint32_t)((u << 24) | (t << 16) | (s <<  8) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB8
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB8(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s, t, u;
+
+    r = __SSAT(((((q31_t)x << 24) >> 24) - (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    s = __SSAT(((((q31_t)x << 16) >> 24) - (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    t = __SSAT(((((q31_t)x <<  8) >> 24) - (((q31_t)y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    u = __SSAT(((((q31_t)x      ) >> 24) - (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
+
+    return ((uint32_t)((u << 24) | (t << 16) | (s <<  8) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QADD16
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+/*  q31_t r,     s;  without initialisation 'arm_offset_q15 test' fails  but 'intrinsic' tests pass! for armCC */
+    q31_t r = 0, s = 0;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHADD16
+   */
+  __STATIC_FORCEINLINE uint32_t __SHADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB16
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHSUB16
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QASX
+   */
+  __STATIC_FORCEINLINE uint32_t __QASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHASX
+   */
+  __STATIC_FORCEINLINE uint32_t __SHASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSAX
+   */
+  __STATIC_FORCEINLINE uint32_t __QSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHSAX
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SMUSDX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSDX(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16))   ));
+  }
+
+  /*
+   * @brief C custom defined SMUADX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUADX(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined QADD
+   */
+  __STATIC_FORCEINLINE int32_t __QADD(
+  int32_t x,
+  int32_t y)
+  {
+    return ((int32_t)(clip_q63_to_q31((q63_t)x + (q31_t)y)));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB
+   */
+  __STATIC_FORCEINLINE int32_t __QSUB(
+  int32_t x,
+  int32_t y)
+  {
+    return ((int32_t)(clip_q63_to_q31((q63_t)x - (q31_t)y)));
+  }
+
+
+  /*
+   * @brief C custom defined SMLAD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLAD(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLADX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLADX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLSDX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLSDX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLALD
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALD(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+/*  return (sum + ((q15_t) (x >> 16) * (q15_t) (y >> 16)) + ((q15_t) x * (q15_t) y)); */
+    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ( ((q63_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLALDX
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALDX(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+/*  return (sum + ((q15_t) (x >> 16) * (q15_t) y)) + ((q15_t) x * (q15_t) (y >> 16)); */
+    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q63_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMUAD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUAD(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMUSD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSD(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined SXTB16
+   */
+  __STATIC_FORCEINLINE uint32_t __SXTB16(
+  uint32_t x)
+  {
+    return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) |
+                       ((((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000)  ));
+  }
+
+  /*
+   * @brief C custom defined SMMLA
+   */
+  __STATIC_FORCEINLINE int32_t __SMMLA(
+  int32_t x,
+  int32_t y,
+  int32_t sum)
+  {
+    return (sum + (int32_t) (((int64_t) x * y) >> 32));
+  }
+
+#endif /* !defined (ARM_MATH_DSP) */
+
+
+  /**
+   * @brief Instance structure for the Q7 FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;        /**< number of filter coefficients in the filter. */
+          q7_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const q7_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
+  } arm_fir_instance_q7;
+
+  /**
+   * @brief Instance structure for the Q15 FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;         /**< number of filter coefficients in the filter. */
+          q15_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const q15_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
+  } arm_fir_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;         /**< number of filter coefficients in the filter. */
+          q31_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const q31_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps. */
+  } arm_fir_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of filter coefficients in the filter. */
+          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
+  } arm_fir_instance_f32;
+
+  /**
+   * @brief Processing function for the Q7 FIR filter.
+   * @param[in]  S          points to an instance of the Q7 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_q7(
+  const arm_fir_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q7 FIR filter.
+   * @param[in,out] S          points to an instance of the Q7 FIR structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed.
+   */
+  void arm_fir_init_q7(
+        arm_fir_instance_q7 * S,
+        uint16_t numTaps,
+  const q7_t * pCoeffs,
+        q7_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q15 FIR filter.
+   * @param[in]  S          points to an instance of the Q15 FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the fast Q15 FIR filter (fast version).
+   * @param[in]  S          points to an instance of the Q15 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_fast_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q15 FIR filter.
+   * @param[in,out] S          points to an instance of the Q15 FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter. Must be even and greater than or equal to 4.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   * @return     The function returns either
+   * <code>ARM_MATH_SUCCESS</code> if initialization was successful or
+   * <code>ARM_MATH_ARGUMENT_ERROR</code> if <code>numTaps</code> is not a supported value.
+   */
+  arm_status arm_fir_init_q15(
+        arm_fir_instance_q15 * S,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR filter.
+   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_q31(
+  const arm_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the fast Q31 FIR filter (fast version).
+   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_fast_q31(
+  const arm_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q31 FIR filter.
+   * @param[in,out] S          points to an instance of the Q31 FIR structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   */
+  void arm_fir_init_q31(
+        arm_fir_instance_q31 * S,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the floating-point FIR filter.
+   * @param[in]  S          points to an instance of the floating-point FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_f32(
+  const arm_fir_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the floating-point FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   */
+  void arm_fir_init_f32(
+        arm_fir_instance_f32 * S,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Instance structure for the Q15 Biquad cascade filter.
+   */
+  typedef struct
+  {
+          int8_t numStages;        /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          q15_t *pState;           /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const q15_t *pCoeffs;          /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+          int8_t postShift;        /**< Additional shift, in bits, applied to each output sample. */
+  } arm_biquad_casd_df1_inst_q15;
+
+  /**
+   * @brief Instance structure for the Q31 Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          q31_t *pState;           /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const q31_t *pCoeffs;          /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+          uint8_t postShift;       /**< Additional shift, in bits, applied to each output sample. */
+  } arm_biquad_casd_df1_inst_q31;
+
+  /**
+   * @brief Instance structure for the floating-point Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;       /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const float32_t *pCoeffs;      /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_casd_df1_inst_f32;
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  /**
+   * @brief Instance structure for the modified Biquad coefs required by vectorized code.
+   */
+  typedef struct
+  {
+      float32_t coeffs[8][4]; /**< Points to the array of modified coefficients.  The array is of length 32. There is one per stage */
+  } arm_biquad_mod_coef_f32;
+#endif
+
+  /**
+   * @brief Processing function for the Q15 Biquad cascade filter.
+   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q15 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
+   */
+  void arm_biquad_cascade_df1_init_q15(
+        arm_biquad_casd_df1_inst_q15 * S,
+        uint8_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int8_t postShift);
+
+  /**
+   * @brief Fast but less precise processing function for the Q15 Biquad cascade filter for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_fast_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 Biquad cascade filter
+   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Fast but less precise processing function for the Q31 Biquad cascade filter for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_fast_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q31 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
+   */
+  void arm_biquad_cascade_df1_init_q31(
+        arm_biquad_casd_df1_inst_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int8_t postShift);
+
+  /**
+   * @brief Processing function for the floating-point Biquad cascade filter.
+   * @param[in]  S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_f32(
+  const arm_biquad_casd_df1_inst_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the floating-point Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pCoeffsMod points to the modified filter coefficients (only MVE version).
+   * @param[in]     pState     points to the state buffer.
+   */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  void arm_biquad_cascade_df1_mve_init_f32(
+      arm_biquad_casd_df1_inst_f32 * S,
+      uint8_t numStages,
+      const float32_t * pCoeffs,
+      arm_biquad_mod_coef_f32 * pCoeffsMod,
+      float32_t * pState);
+#endif
+
+  void arm_biquad_cascade_df1_init_f32(
+        arm_biquad_casd_df1_inst_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_and_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_and_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_and_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_or_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_or_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_or_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
+   * @param[in]     pSrc       points to input vector
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_not_u16(
+    const uint16_t * pSrc,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
+   * @param[in]     pSrc       points to input vector
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_not_u32(
+    const uint32_t * pSrc,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
+   * @param[in]     pSrc       points to input vector
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_not_u8(
+    const uint8_t * pSrc,
+          uint8_t * pDst,
+          uint32_t blockSize);
+
+/**
+   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_xor_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_xor_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_xor_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @brief Struct for specifying sorting algorithm
+   */
+  typedef enum
+  {
+    ARM_SORT_BITONIC   = 0,
+             /**< Bitonic sort   */
+    ARM_SORT_BUBBLE    = 1,
+             /**< Bubble sort    */
+    ARM_SORT_HEAP      = 2,
+             /**< Heap sort      */
+    ARM_SORT_INSERTION = 3,
+             /**< Insertion sort */
+    ARM_SORT_QUICK     = 4,
+             /**< Quick sort     */
+    ARM_SORT_SELECTION = 5
+             /**< Selection sort */
+  } arm_sort_alg;
+
+  /**
+   * @brief Struct for specifying sorting algorithm
+   */
+  typedef enum
+  {
+    ARM_SORT_DESCENDING = 0,
+             /**< Descending order (9 to 0) */
+    ARM_SORT_ASCENDING = 1
+             /**< Ascending order (0 to 9) */
+  } arm_sort_dir;
+
+  /**
+   * @brief Instance structure for the sorting algorithms.
+   */
+  typedef struct
+  {
+    arm_sort_alg alg;        /**< Sorting algorithm selected */
+    arm_sort_dir dir;        /**< Sorting order (direction)  */
+  } arm_sort_instance_f32;
+
+  /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @param[in,out]  S            points to an instance of the sorting structure.
+   * @param[in]      alg          Selected algorithm.
+   * @param[in]      dir          Sorting order.
+   */
+  void arm_sort_init_f32(
+    arm_sort_instance_f32 * S,
+    arm_sort_alg alg,
+    arm_sort_dir dir);
+
+  /**
+   * @brief Instance structure for the sorting algorithms.
+   */
+  typedef struct
+  {
+    arm_sort_dir dir;        /**< Sorting order (direction)  */
+    float32_t * buffer;      /**< Working buffer */
+  } arm_merge_sort_instance_f32;
+
+  /**
+   * @param[in]      S          points to an instance of the sorting structure.
+   * @param[in,out]  pSrc       points to the block of input data.
+   * @param[out]     pDst       points to the block of output data
+   * @param[in]      blockSize  number of samples to process.
+   */
+  void arm_merge_sort_f32(
+    const arm_merge_sort_instance_f32 * S,
+          float32_t *pSrc,
+          float32_t *pDst,
+          uint32_t blockSize);
+
+  /**
+   * @param[in,out]  S            points to an instance of the sorting structure.
+   * @param[in]      dir          Sorting order.
+   * @param[in]      buffer       Working buffer.
+   */
+  void arm_merge_sort_init_f32(
+    arm_merge_sort_instance_f32 * S,
+    arm_sort_dir dir,
+    float32_t * buffer);
+
+  /**
+   * @brief Struct for specifying cubic spline type
+   */
+  typedef enum
+  {
+    ARM_SPLINE_NATURAL = 0,           /**< Natural spline */
+    ARM_SPLINE_PARABOLIC_RUNOUT = 1   /**< Parabolic runout spline */
+  } arm_spline_type;
+
+  /**
+   * @brief Instance structure for the floating-point cubic spline interpolation.
+   */
+  typedef struct
+  {
+    arm_spline_type type;      /**< Type (boundary conditions) */
+    const float32_t * x;       /**< x values */
+    const float32_t * y;       /**< y values */
+    uint32_t n_x;              /**< Number of known data points */
+    float32_t * coeffs;        /**< Coefficients buffer (b,c, and d) */
+  } arm_spline_instance_f32;
+
+  /**
+   * @brief Processing function for the floating-point cubic spline interpolation.
+   * @param[in]  S          points to an instance of the floating-point spline structure.
+   * @param[in]  xq         points to the x values ot the interpolated data points.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples of output data.
+   */
+  void arm_spline_f32(
+        arm_spline_instance_f32 * S,
+  const float32_t * xq,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Initialization function for the floating-point cubic spline interpolation.
+   * @param[in,out] S        points to an instance of the floating-point spline structure.
+   * @param[in]     type     type of cubic spline interpolation (boundary conditions)
+   * @param[in]     x        points to the x values of the known data points.
+   * @param[in]     y        points to the y values of the known data points.
+   * @param[in]     n        number of known data points.
+   * @param[in]     coeffs   coefficients array for b, c, and d
+   * @param[in]     tempBuffer   buffer array for internal computations
+   */
+  void arm_spline_init_f32(
+          arm_spline_instance_f32 * S,
+          arm_spline_type type,
+    const float32_t * x,
+    const float32_t * y,
+          uint32_t n,
+          float32_t * coeffs,
+          float32_t * tempBuffer);
+
+  /**
+   * @brief Instance structure for the floating-point matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float32_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f32;
+
+ /**
+   * @brief Instance structure for the floating-point matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float64_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f64;
+
+  /**
+   * @brief Instance structure for the Q15 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q15_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q31_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q31;
+
+  /**
+   * @brief Floating-point matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix addition.
+   * @param[in]   pSrcA  points to the first input matrix structure
+   * @param[in]   pSrcB  points to the second input matrix structure
+   * @param[out]  pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15, complex,  matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pScratch);
+
+  /**
+   * @brief Q31, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_fast_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q31 matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_fast_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix scaling.
+   * @param[in]  pSrc   points to the input matrix
+   * @param[in]  scale  scale factor
+   * @param[out] pDst   points to the output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        float32_t scale,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_q15(
+  const arm_matrix_instance_q15 * pSrc,
+        q15_t scaleFract,
+        int32_t shift,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_q31(
+  const arm_matrix_instance_q31 * pSrc,
+        q31_t scaleFract,
+        int32_t shift,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief  Q31 matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_q31(
+        arm_matrix_instance_q31 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q31_t * pData);
+
+  /**
+   * @brief  Q15 matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_q15(
+        arm_matrix_instance_q15 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q15_t * pData);
+
+  /**
+   * @brief  Floating-point matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_f32(
+        arm_matrix_instance_f32 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        float32_t * pData);
+
+
+  /**
+   * @brief Instance structure for the Q15 PID Control.
+   */
+  typedef struct
+  {
+          q15_t A0;           /**< The derived gain, A0 = Kp + Ki + Kd . */
+#if !defined (ARM_MATH_DSP)
+          q15_t A1;
+          q15_t A2;
+#else
+          q31_t A1;           /**< The derived gain A1 = -Kp - 2Kd | Kd.*/
+#endif
+          q15_t state[3];     /**< The state array of length 3. */
+          q15_t Kp;           /**< The proportional gain. */
+          q15_t Ki;           /**< The integral gain. */
+          q15_t Kd;           /**< The derivative gain. */
+  } arm_pid_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 PID Control.
+   */
+  typedef struct
+  {
+          q31_t A0;            /**< The derived gain, A0 = Kp + Ki + Kd . */
+          q31_t A1;            /**< The derived gain, A1 = -Kp - 2Kd. */
+          q31_t A2;            /**< The derived gain, A2 = Kd . */
+          q31_t state[3];      /**< The state array of length 3. */
+          q31_t Kp;            /**< The proportional gain. */
+          q31_t Ki;            /**< The integral gain. */
+          q31_t Kd;            /**< The derivative gain. */
+  } arm_pid_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point PID Control.
+   */
+  typedef struct
+  {
+          float32_t A0;          /**< The derived gain, A0 = Kp + Ki + Kd . */
+          float32_t A1;          /**< The derived gain, A1 = -Kp - 2Kd. */
+          float32_t A2;          /**< The derived gain, A2 = Kd . */
+          float32_t state[3];    /**< The state array of length 3. */
+          float32_t Kp;          /**< The proportional gain. */
+          float32_t Ki;          /**< The integral gain. */
+          float32_t Kd;          /**< The derivative gain. */
+  } arm_pid_instance_f32;
+
+
+
+  /**
+   * @brief  Initialization function for the floating-point PID Control.
+   * @param[in,out] S               points to an instance of the PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
+   */
+  void arm_pid_init_f32(
+        arm_pid_instance_f32 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the floating-point PID Control.
+   * @param[in,out] S  is an instance of the floating-point PID Control structure
+   */
+  void arm_pid_reset_f32(
+        arm_pid_instance_f32 * S);
+
+
+  /**
+   * @brief  Initialization function for the Q31 PID Control.
+   * @param[in,out] S               points to an instance of the Q15 PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
+   */
+  void arm_pid_init_q31(
+        arm_pid_instance_q31 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the Q31 PID Control.
+   * @param[in,out] S   points to an instance of the Q31 PID Control structure
+   */
+
+  void arm_pid_reset_q31(
+        arm_pid_instance_q31 * S);
+
+
+  /**
+   * @brief  Initialization function for the Q15 PID Control.
+   * @param[in,out] S               points to an instance of the Q15 PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
+   */
+  void arm_pid_init_q15(
+        arm_pid_instance_q15 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the Q15 PID Control.
+   * @param[in,out] S  points to an instance of the q15 PID Control structure
+   */
+  void arm_pid_reset_q15(
+        arm_pid_instance_q15 * S);
+
+
+  /**
+   * @brief Instance structure for the floating-point Linear Interpolate function.
+   */
+  typedef struct
+  {
+          uint32_t nValues;           /**< nValues */
+          float32_t x1;               /**< x1 */
+          float32_t xSpacing;         /**< xSpacing */
+          float32_t *pYData;          /**< pointer to the table of Y values */
+  } arm_linear_interp_instance_f32;
+
+  /**
+   * @brief Instance structure for the floating-point bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          float32_t *pData;   /**< points to the data table. */
+  } arm_bilinear_interp_instance_f32;
+
+   /**
+   * @brief Instance structure for the Q31 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q31_t *pData;       /**< points to the data table. */
+  } arm_bilinear_interp_instance_q31;
+
+   /**
+   * @brief Instance structure for the Q15 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q15_t *pData;       /**< points to the data table. */
+  } arm_bilinear_interp_instance_q15;
+
+   /**
+   * @brief Instance structure for the Q15 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q7_t *pData;        /**< points to the data table. */
+  } arm_bilinear_interp_instance_q7;
+
+
+  /**
+   * @brief Q7 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q15_t *pTwiddle;                 /**< points to the Sin twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix2_instance_q15;
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_q15(
+        arm_cfft_radix2_instance_q15 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_q15(
+  const arm_cfft_radix2_instance_q15 * S,
+        q15_t * pSrc);
+
+
+  /**
+   * @brief Instance structure for the Q15 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q15_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix4_instance_q15;
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_q15(
+        arm_cfft_radix4_instance_q15 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix4_q15(
+  const arm_cfft_radix4_instance_q15 * S,
+        q15_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q31_t *pTwiddle;                 /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix2_instance_q31;
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_q31(
+        arm_cfft_radix2_instance_q31 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_q31(
+  const arm_cfft_radix2_instance_q31 * S,
+        q31_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Q31 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q31_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix4_instance_q31;
+
+/* Deprecated */
+  void arm_cfft_radix4_q31(
+  const arm_cfft_radix4_instance_q31 * S,
+        q31_t * pSrc);
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_q31(
+        arm_cfft_radix4_instance_q31 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float32_t *pTwiddle;               /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float32_t onebyfftLen;             /**< value of 1/fftLen. */
+  } arm_cfft_radix2_instance_f32;
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_f32(
+        arm_cfft_radix2_instance_f32 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_f32(
+  const arm_cfft_radix2_instance_f32 * S,
+        float32_t * pSrc);
+
+  /**
+   * @brief Instance structure for the floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float32_t *pTwiddle;               /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float32_t onebyfftLen;             /**< value of 1/fftLen. */
+  } arm_cfft_radix4_instance_f32;
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_f32(
+        arm_cfft_radix4_instance_f32 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix4_f32(
+  const arm_cfft_radix4_instance_f32 * S,
+        float32_t * pSrc);
+
+  /**
+   * @brief Instance structure for the fixed-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const q15_t *pTwiddle;             /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEI)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
+   const q15_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */                                                                   \
+   const q15_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */                                                                   \
+   const q15_t *rearranged_twiddle_stride3;
+#endif
+  } arm_cfft_instance_q15;
+
+arm_status arm_cfft_init_q15(
+  arm_cfft_instance_q15 * S,
+  uint16_t fftLen);
+
+void arm_cfft_q15(
+    const arm_cfft_instance_q15 * S,
+          q15_t * p1,
+          uint8_t ifftFlag,
+          uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the fixed-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const q31_t *pTwiddle;             /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEI)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
+   const q31_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */                                                                   \
+   const q31_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */                                                                   \
+   const q31_t *rearranged_twiddle_stride3;
+#endif
+  } arm_cfft_instance_q31;
+
+arm_status arm_cfft_init_q31(
+  arm_cfft_instance_q31 * S,
+  uint16_t fftLen);
+
+void arm_cfft_q31(
+    const arm_cfft_instance_q31 * S,
+          q31_t * p1,
+          uint8_t ifftFlag,
+          uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const float32_t *pTwiddle;         /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
+   const float32_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */                                                                   \
+   const float32_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */                                                                   \
+   const float32_t *rearranged_twiddle_stride3;
+#endif
+  } arm_cfft_instance_f32;
+
+
+  arm_status arm_cfft_init_f32(
+  arm_cfft_instance_f32 * S,
+  uint16_t fftLen);
+
+  void arm_cfft_f32(
+  const arm_cfft_instance_f32 * S,
+        float32_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+
+  /**
+   * @brief Instance structure for the Double Precision Floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const float64_t *pTwiddle;         /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+  } arm_cfft_instance_f64;
+
+  void arm_cfft_f64(
+  const arm_cfft_instance_f64 * S,
+        float64_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Q15 RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                      /**< length of the real FFT. */
+          uint8_t ifftFlagR;                        /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                  /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;               /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const q15_t *pTwiddleAReal;                     /**< points to the real twiddle factor table. */
+    const q15_t *pTwiddleBReal;                     /**< points to the imag twiddle factor table. */
+#if defined(ARM_MATH_MVEI)
+    arm_cfft_instance_q15 cfftInst;
+#else
+    const arm_cfft_instance_q15 *pCfft;       /**< points to the complex FFT instance. */
+#endif
+  } arm_rfft_instance_q15;
+
+  arm_status arm_rfft_init_q15(
+        arm_rfft_instance_q15 * S,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void arm_rfft_q15(
+  const arm_rfft_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst);
+
+  /**
+   * @brief Instance structure for the Q31 RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                        /**< length of the real FFT. */
+          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;                 /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const q31_t *pTwiddleAReal;                       /**< points to the real twiddle factor table. */
+    const q31_t *pTwiddleBReal;                       /**< points to the imag twiddle factor table. */
+#if defined(ARM_MATH_MVEI)
+    arm_cfft_instance_q31 cfftInst;
+#else
+    const arm_cfft_instance_q31 *pCfft;         /**< points to the complex FFT instance. */
+#endif
+  } arm_rfft_instance_q31;
+
+  arm_status arm_rfft_init_q31(
+        arm_rfft_instance_q31 * S,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void arm_rfft_q31(
+  const arm_rfft_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst);
+
+  /**
+   * @brief Instance structure for the floating-point RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                        /**< length of the real FFT. */
+          uint16_t fftLenBy2;                         /**< length of the complex FFT. */
+          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;                     /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const float32_t *pTwiddleAReal;                   /**< points to the real twiddle factor table. */
+    const float32_t *pTwiddleBReal;                   /**< points to the imag twiddle factor table. */
+          arm_cfft_radix4_instance_f32 *pCfft;        /**< points to the complex FFT instance. */
+  } arm_rfft_instance_f32;
+
+  arm_status arm_rfft_init_f32(
+        arm_rfft_instance_f32 * S,
+        arm_cfft_radix4_instance_f32 * S_CFFT,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void arm_rfft_f32(
+  const arm_rfft_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst);
+
+  /**
+   * @brief Instance structure for the Double Precision Floating-point RFFT/RIFFT function.
+   */
+typedef struct
+  {
+          arm_cfft_instance_f64 Sint;      /**< Internal CFFT structure. */
+          uint16_t fftLenRFFT;             /**< length of the real sequence */
+    const float64_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
+  } arm_rfft_fast_instance_f64 ;
+
+arm_status arm_rfft_fast_init_f64 (
+         arm_rfft_fast_instance_f64 * S,
+         uint16_t fftLen);
+
+
+void arm_rfft_fast_f64(
+    arm_rfft_fast_instance_f64 * S,
+    float64_t * p, float64_t * pOut,
+    uint8_t ifftFlag);
+
+
+  /**
+   * @brief Instance structure for the floating-point RFFT/RIFFT function.
+   */
+typedef struct
+  {
+          arm_cfft_instance_f32 Sint;      /**< Internal CFFT structure. */
+          uint16_t fftLenRFFT;             /**< length of the real sequence */
+    const float32_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
+  } arm_rfft_fast_instance_f32 ;
+
+arm_status arm_rfft_fast_init_f32 (
+         arm_rfft_fast_instance_f32 * S,
+         uint16_t fftLen);
+
+
+  void arm_rfft_fast_f32(
+        const arm_rfft_fast_instance_f32 * S,
+        float32_t * p, float32_t * pOut,
+        uint8_t ifftFlag);
+
+  /**
+   * @brief Instance structure for the floating-point DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          float32_t normalize;                 /**< normalizing factor. */
+    const float32_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const float32_t *pCosFactor;               /**< points to the cosFactor table. */
+          arm_rfft_instance_f32 *pRfft;        /**< points to the real FFT instance. */
+          arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */
+  } arm_dct4_instance_f32;
+
+
+  /**
+   * @brief  Initialization function for the floating-point DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of floating-point DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of floating-point RFFT/RIFFT structure.
+   * @param[in]     S_CFFT     points to an instance of floating-point CFFT/CIFFT structure.
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>fftLenReal</code> is not a supported transform length.
+   */
+  arm_status arm_dct4_init_f32(
+        arm_dct4_instance_f32 * S,
+        arm_rfft_instance_f32 * S_RFFT,
+        arm_cfft_radix4_instance_f32 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        float32_t normalize);
+
+
+  /**
+   * @brief Processing function for the floating-point DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the floating-point DCT4/IDCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void arm_dct4_f32(
+  const arm_dct4_instance_f32 * S,
+        float32_t * pState,
+        float32_t * pInlineBuffer);
+
+
+  /**
+   * @brief Instance structure for the Q31 DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          q31_t normalize;                     /**< normalizing factor. */
+    const q31_t *pTwiddle;                     /**< points to the twiddle factor table. */
+    const q31_t *pCosFactor;                   /**< points to the cosFactor table. */
+          arm_rfft_instance_q31 *pRfft;        /**< points to the real FFT instance. */
+          arm_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */
+  } arm_dct4_instance_q31;
+
+
+  /**
+   * @brief  Initialization function for the Q31 DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of Q31 DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of Q31 RFFT/RIFFT structure
+   * @param[in]     S_CFFT     points to an instance of Q31 CFFT/CIFFT structure
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  arm_status arm_dct4_init_q31(
+        arm_dct4_instance_q31 * S,
+        arm_rfft_instance_q31 * S_RFFT,
+        arm_cfft_radix4_instance_q31 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        q31_t normalize);
+
+
+  /**
+   * @brief Processing function for the Q31 DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the Q31 DCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void arm_dct4_q31(
+  const arm_dct4_instance_q31 * S,
+        q31_t * pState,
+        q31_t * pInlineBuffer);
+
+
+  /**
+   * @brief Instance structure for the Q15 DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          q15_t normalize;                     /**< normalizing factor. */
+    const q15_t *pTwiddle;                     /**< points to the twiddle factor table. */
+    const q15_t *pCosFactor;                   /**< points to the cosFactor table. */
+          arm_rfft_instance_q15 *pRfft;        /**< points to the real FFT instance. */
+          arm_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */
+  } arm_dct4_instance_q15;
+
+
+  /**
+   * @brief  Initialization function for the Q15 DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of Q15 DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of Q15 RFFT/RIFFT structure.
+   * @param[in]     S_CFFT     points to an instance of Q15 CFFT/CIFFT structure.
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  arm_status arm_dct4_init_q15(
+        arm_dct4_instance_q15 * S,
+        arm_rfft_instance_q15 * S_RFFT,
+        arm_cfft_radix4_instance_q15 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        q15_t normalize);
+
+
+  /**
+   * @brief Processing function for the Q15 DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the Q15 DCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void arm_dct4_q15(
+  const arm_dct4_instance_q15 * S,
+        q15_t * pState,
+        q15_t * pInlineBuffer);
+
+
+  /**
+   * @brief Floating-point vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a floating-point vector by a scalar.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  scale      scale factor to be applied
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_scale_f32(
+  const float32_t * pSrc,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q7 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void arm_scale_q7(
+  const q7_t * pSrc,
+        q7_t scaleFract,
+        int8_t shift,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q15 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void arm_scale_q15(
+  const q15_t * pSrc,
+        q15_t scaleFract,
+        int8_t shift,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q31 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void arm_scale_q31(
+  const q31_t * pSrc,
+        q31_t scaleFract,
+        int8_t shift,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Dot product of floating-point vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t blockSize,
+        float32_t * result);
+
+
+  /**
+   * @brief Dot product of Q7 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        uint32_t blockSize,
+        q31_t * result);
+
+
+  /**
+   * @brief Dot product of Q15 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result);
+
+
+  /**
+   * @brief Dot product of Q31 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result);
+
+
+  /**
+   * @brief  Shifts the elements of a Q7 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_shift_q7(
+  const q7_t * pSrc,
+        int8_t shiftBits,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Shifts the elements of a Q15 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_shift_q15(
+  const q15_t * pSrc,
+        int8_t shiftBits,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Shifts the elements of a Q31 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_shift_q31(
+  const q31_t * pSrc,
+        int8_t shiftBits,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_q7(
+  const q7_t * pSrc,
+        q7_t offset,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_q15(
+  const q15_t * pSrc,
+        q15_t offset,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_q31(
+  const q31_t * pSrc,
+        q31_t offset,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a floating-point vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_copy_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q7 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_copy_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q15 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_copy_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q31 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_copy_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a floating-point vector.
+   * @param[in]  value      input value to be filled
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_fill_f32(
+        float32_t value,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a Q7 vector.
+   * @param[in]  value      input value to be filled
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_fill_q7(
+        q7_t value,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a Q15 vector.
+   * @param[in]  value      input value to be filled
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_fill_q15(
+        q15_t value,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a Q31 vector.
+   * @param[in]  value      input value to be filled
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_fill_q31(
+        q31_t value,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+ * @brief Convolution of floating-point sequences.
+ * @param[in]  pSrcA    points to the first input sequence.
+ * @param[in]  srcALen  length of the first input sequence.
+ * @param[in]  pSrcB    points to the second input sequence.
+ * @param[in]  srcBLen  length of the second input sequence.
+ * @param[out] pDst     points to the location where the output result is written.  Length srcALen+srcBLen-1.
+ */
+  void arm_conv_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data  Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
+   */
+  void arm_conv_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+/**
+ * @brief Convolution of Q15 sequences.
+ * @param[in]  pSrcA    points to the first input sequence.
+ * @param[in]  srcALen  length of the first input sequence.
+ * @param[in]  pSrcB    points to the second input sequence.
+ * @param[in]  srcBLen  length of the second input sequence.
+ * @param[out] pDst     points to the location where the output result is written.  Length srcALen+srcBLen-1.
+ */
+  void arm_conv_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
+   */
+  void arm_conv_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data  Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
+   */
+  void arm_conv_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q31 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
+   */
+  void arm_conv_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
+   */
+  void arm_conv_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+    /**
+   * @brief Convolution of Q7 sequences.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data  Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   */
+  void arm_conv_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q7 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
+   */
+  void arm_conv_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst);
+
+
+  /**
+   * @brief Partial convolution of floating-point sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to scratch buffer of size min(srcALen, srcBLen).
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to scratch buffer of size min(srcALen, srcBLen).
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q31 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q7 sequences
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+/**
+   * @brief Partial convolution of Q7 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR decimator.
+   */
+  typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const q15_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps.*/
+          q15_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } arm_fir_decimate_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR decimator.
+   */
+  typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const q31_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps.*/
+          q31_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } arm_fir_decimate_instance_q31;
+
+/**
+  @brief Instance structure for floating-point FIR decimator.
+ */
+typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const float32_t *pCoeffs;         /**< points to the coefficient array. The array is of length numTaps.*/
+          float32_t *pState;          /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } arm_fir_decimate_instance_f32;
+
+
+/**
+  @brief         Processing function for floating-point FIR decimator.
+  @param[in]     S         points to an instance of the floating-point FIR decimator structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+ */
+void arm_fir_decimate_f32(
+  const arm_fir_decimate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+  @brief         Initialization function for the floating-point FIR decimator.
+  @param[in,out] S          points to an instance of the floating-point FIR decimator structure
+  @param[in]     numTaps    number of coefficients in the filter
+  @param[in]     M          decimation factor
+  @param[in]     pCoeffs    points to the filter coefficients
+  @param[in]     pState     points to the state buffer
+  @param[in]     blockSize  number of input samples to process per call
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS      : Operation successful
+                   - \ref ARM_MATH_LENGTH_ERROR : <code>blockSize</code> is not a multiple of <code>M</code>
+ */
+arm_status arm_fir_decimate_init_f32(
+        arm_fir_decimate_instance_f32 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR decimator.
+   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_decimate_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_decimate_fast_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 FIR decimator.
+   * @param[in,out] S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]     numTaps    number of coefficients in the filter.
+   * @param[in]     M          decimation factor.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return    The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+   * <code>blockSize</code> is not a multiple of <code>M</code>.
+   */
+  arm_status arm_fir_decimate_init_q15(
+        arm_fir_decimate_instance_q15 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR decimator.
+   * @param[in]  S     points to an instance of the Q31 FIR decimator structure.
+   * @param[in]  pSrc  points to the block of input data.
+   * @param[out] pDst  points to the block of output data
+   * @param[in] blockSize number of input samples to process per call.
+   */
+  void arm_fir_decimate_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_decimate_fast_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 FIR decimator.
+   * @param[in,out] S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]     numTaps    number of coefficients in the filter.
+   * @param[in]     M          decimation factor.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return    The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+   * <code>blockSize</code> is not a multiple of <code>M</code>.
+   */
+  arm_status arm_fir_decimate_init_q31(
+        arm_fir_decimate_instance_q31 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR interpolator.
+   */
+  typedef struct
+  {
+        uint8_t L;                      /**< upsample factor. */
+        uint16_t phaseLength;           /**< length of each polyphase filter component. */
+  const q15_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
+        q15_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } arm_fir_interpolate_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR interpolator.
+   */
+  typedef struct
+  {
+        uint8_t L;                      /**< upsample factor. */
+        uint16_t phaseLength;           /**< length of each polyphase filter component. */
+  const q31_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
+        q31_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } arm_fir_interpolate_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR interpolator.
+   */
+  typedef struct
+  {
+        uint8_t L;                     /**< upsample factor. */
+        uint16_t phaseLength;          /**< length of each polyphase filter component. */
+  const float32_t *pCoeffs;            /**< points to the coefficient array. The array is of length L*phaseLength. */
+        float32_t *pState;             /**< points to the state variable array. The array is of length phaseLength+numTaps-1. */
+  } arm_fir_interpolate_instance_f32;
+
+
+  /**
+   * @brief Processing function for the Q15 FIR interpolator.
+   * @param[in]  S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 FIR interpolator.
+   * @param[in,out] S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+   */
+  arm_status arm_fir_interpolate_init_q15(
+        arm_fir_interpolate_instance_q15 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR interpolator.
+   * @param[in]  S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_interpolate_q31(
+  const arm_fir_interpolate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 FIR interpolator.
+   * @param[in,out] S          points to an instance of the Q31 FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+   */
+  arm_status arm_fir_interpolate_init_q31(
+        arm_fir_interpolate_instance_q31 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the floating-point FIR interpolator.
+   * @param[in]  S          points to an instance of the floating-point FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_interpolate_f32(
+  const arm_fir_interpolate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the floating-point FIR interpolator.
+   * @param[in,out] S          points to an instance of the floating-point FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+   */
+  arm_status arm_fir_interpolate_init_f32(
+        arm_fir_interpolate_instance_f32 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the high precision Q31 Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;       /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          q63_t *pState;           /**< points to the array of state coefficients.  The array is of length 4*numStages. */
+    const q31_t *pCoeffs;          /**< points to the array of coefficients.  The array is of length 5*numStages. */
+          uint8_t postShift;       /**< additional shift, in bits, applied to each output sample. */
+  } arm_biquad_cas_df1_32x64_ins_q31;
+
+
+  /**
+   * @param[in]  S          points to an instance of the high precision Q31 Biquad cascade filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cas_df1_32x64_q31(
+  const arm_biquad_cas_df1_32x64_ins_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @param[in,out] S          points to an instance of the high precision Q31 Biquad cascade filter structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  shift to be applied to the output. Varies according to the coefficients format
+   */
+  void arm_biquad_cas_df1_32x64_init_q31(
+        arm_biquad_cas_df1_32x64_ins_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q63_t * pState,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
+    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_df2T_instance_f32;
+
+  /**
+   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;         /**< points to the array of state coefficients.  The array is of length 4*numStages. */
+    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_stereo_df2T_instance_f32;
+
+  /**
+   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float64_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
+    const float64_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_df2T_instance_f64;
+
+
+  /**
+   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df2T_f32(
+  const arm_biquad_cascade_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. 2 channels
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_stereo_df2T_f32(
+  const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df2T_f64(
+  const arm_biquad_cascade_df2T_instance_f64 * S,
+  const float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize);
+
+
+#if defined(ARM_MATH_NEON)
+void arm_biquad_cascade_df2T_compute_coefs_f32(
+  arm_biquad_cascade_df2T_instance_f32 * S,
+  uint8_t numStages,
+  float32_t * pCoeffs);
+#endif
+  /**
+   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void arm_biquad_cascade_df2T_init_f32(
+        arm_biquad_cascade_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void arm_biquad_cascade_stereo_df2T_init_f32(
+        arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void arm_biquad_cascade_df2T_init_f64(
+        arm_biquad_cascade_df2T_instance_f64 * S,
+        uint8_t numStages,
+        const float64_t * pCoeffs,
+        float64_t * pState);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
+    const q15_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
+  } arm_fir_lattice_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
+    const q31_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
+  } arm_fir_lattice_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages. */
+    const float32_t *pCoeffs;                  /**< points to the coefficient array. The array is of length numStages. */
+  } arm_fir_lattice_instance_f32;
+
+
+  /**
+   * @brief Initialization function for the Q15 FIR lattice filter.
+   * @param[in] S          points to an instance of the Q15 FIR lattice structure.
+   * @param[in] numStages  number of filter stages.
+   * @param[in] pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in] pState     points to the state buffer.  The array is of length numStages.
+   */
+  void arm_fir_lattice_init_q15(
+        arm_fir_lattice_instance_q15 * S,
+        uint16_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR lattice filter.
+   * @param[in]  S          points to an instance of the Q15 FIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_lattice_q15(
+  const arm_fir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 FIR lattice filter.
+   * @param[in] S          points to an instance of the Q31 FIR lattice structure.
+   * @param[in] numStages  number of filter stages.
+   * @param[in] pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in] pState     points to the state buffer.   The array is of length numStages.
+   */
+  void arm_fir_lattice_init_q31(
+        arm_fir_lattice_instance_q31 * S,
+        uint16_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR lattice filter.
+   * @param[in]  S          points to an instance of the Q31 FIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_lattice_q31(
+  const arm_fir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+ * @brief Initialization function for the floating-point FIR lattice filter.
+ * @param[in] S          points to an instance of the floating-point FIR lattice structure.
+ * @param[in] numStages  number of filter stages.
+ * @param[in] pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+ * @param[in] pState     points to the state buffer.  The array is of length numStages.
+ */
+  void arm_fir_lattice_init_f32(
+        arm_fir_lattice_instance_f32 * S,
+        uint16_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief Processing function for the floating-point FIR lattice filter.
+   * @param[in]  S          points to an instance of the floating-point FIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_lattice_f32(
+  const arm_fir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 IIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
+          q15_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
+          q15_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } arm_iir_lattice_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 IIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
+          q31_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
+          q31_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } arm_iir_lattice_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point IIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages+blockSize. */
+          float32_t *pkCoeffs;                 /**< points to the reflection coefficient array. The array is of length numStages. */
+          float32_t *pvCoeffs;                 /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } arm_iir_lattice_instance_f32;
+
+
+  /**
+   * @brief Processing function for the floating-point IIR lattice filter.
+   * @param[in]  S          points to an instance of the floating-point IIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_f32(
+  const arm_iir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point IIR lattice filter.
+   * @param[in] S          points to an instance of the floating-point IIR lattice structure.
+   * @param[in] numStages  number of stages in the filter.
+   * @param[in] pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in] pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in] pState     points to the state buffer.  The array is of length numStages+blockSize-1.
+   * @param[in] blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_init_f32(
+        arm_iir_lattice_instance_f32 * S,
+        uint16_t numStages,
+        float32_t * pkCoeffs,
+        float32_t * pvCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 IIR lattice filter.
+   * @param[in]  S          points to an instance of the Q31 IIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_q31(
+  const arm_iir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 IIR lattice filter.
+   * @param[in] S          points to an instance of the Q31 IIR lattice structure.
+   * @param[in] numStages  number of stages in the filter.
+   * @param[in] pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in] pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in] pState     points to the state buffer.  The array is of length numStages+blockSize.
+   * @param[in] blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_init_q31(
+        arm_iir_lattice_instance_q31 * S,
+        uint16_t numStages,
+        q31_t * pkCoeffs,
+        q31_t * pvCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 IIR lattice filter.
+   * @param[in]  S          points to an instance of the Q15 IIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_q15(
+  const arm_iir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+ * @brief Initialization function for the Q15 IIR lattice filter.
+ * @param[in] S          points to an instance of the fixed-point Q15 IIR lattice structure.
+ * @param[in] numStages  number of stages in the filter.
+ * @param[in] pkCoeffs   points to reflection coefficient buffer.  The array is of length numStages.
+ * @param[in] pvCoeffs   points to ladder coefficient buffer.  The array is of length numStages+1.
+ * @param[in] pState     points to state buffer.  The array is of length numStages+blockSize.
+ * @param[in] blockSize  number of samples to process per call.
+ */
+  void arm_iir_lattice_init_q15(
+        arm_iir_lattice_instance_q15 * S,
+        uint16_t numStages,
+        q15_t * pkCoeffs,
+        q15_t * pvCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the floating-point LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          float32_t *pState;   /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;  /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;        /**< step size that controls filter coefficient updates. */
+  } arm_lms_instance_f32;
+
+
+  /**
+   * @brief Processing function for floating-point LMS filter.
+   * @param[in]  S          points to an instance of the floating-point LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_f32(
+  const arm_lms_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for floating-point LMS filter.
+   * @param[in] S          points to an instance of the floating-point LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to the coefficient buffer.
+   * @param[in] pState     points to state buffer.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   */
+  void arm_lms_init_f32(
+        arm_lms_instance_f32 * S,
+        uint16_t numTaps,
+        float32_t * pCoeffs,
+        float32_t * pState,
+        float32_t mu,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          q15_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q15_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t mu;            /**< step size that controls filter coefficient updates. */
+          uint32_t postShift;  /**< bit shift applied to coefficients. */
+  } arm_lms_instance_q15;
+
+
+  /**
+   * @brief Initialization function for the Q15 LMS filter.
+   * @param[in] S          points to an instance of the Q15 LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to the coefficient buffer.
+   * @param[in] pState     points to the state buffer.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   * @param[in] postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_init_q15(
+        arm_lms_instance_q15 * S,
+        uint16_t numTaps,
+        q15_t * pCoeffs,
+        q15_t * pState,
+        q15_t mu,
+        uint32_t blockSize,
+        uint32_t postShift);
+
+
+  /**
+   * @brief Processing function for Q15 LMS filter.
+   * @param[in]  S          points to an instance of the Q15 LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_q15(
+  const arm_lms_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pRef,
+        q15_t * pOut,
+        q15_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q31 LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          q31_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q31_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t mu;            /**< step size that controls filter coefficient updates. */
+          uint32_t postShift;  /**< bit shift applied to coefficients. */
+  } arm_lms_instance_q31;
+
+
+  /**
+   * @brief Processing function for Q31 LMS filter.
+   * @param[in]  S          points to an instance of the Q15 LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_q31(
+  const arm_lms_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pRef,
+        q31_t * pOut,
+        q31_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for Q31 LMS filter.
+   * @param[in] S          points to an instance of the Q31 LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer.
+   * @param[in] pState     points to state buffer.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   * @param[in] postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_init_q31(
+        arm_lms_instance_q31 * S,
+        uint16_t numTaps,
+        q31_t * pCoeffs,
+        q31_t * pState,
+        q31_t mu,
+        uint32_t blockSize,
+        uint32_t postShift);
+
+
+  /**
+   * @brief Instance structure for the floating-point normalized LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;         /**< step size that control filter coefficient updates. */
+          float32_t energy;     /**< saves previous frame energy. */
+          float32_t x0;         /**< saves previous input sample. */
+  } arm_lms_norm_instance_f32;
+
+
+  /**
+   * @brief Processing function for floating-point normalized LMS filter.
+   * @param[in]  S          points to an instance of the floating-point normalized LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_norm_f32(
+        arm_lms_norm_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for floating-point normalized LMS filter.
+   * @param[in] S          points to an instance of the floating-point LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer.
+   * @param[in] pState     points to state buffer.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   */
+  void arm_lms_norm_init_f32(
+        arm_lms_norm_instance_f32 * S,
+        uint16_t numTaps,
+        float32_t * pCoeffs,
+        float32_t * pState,
+        float32_t mu,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q31 normalized LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          q31_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q31_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t mu;             /**< step size that controls filter coefficient updates. */
+          uint8_t postShift;    /**< bit shift applied to coefficients. */
+    const q31_t *recipTable;    /**< points to the reciprocal initial value table. */
+          q31_t energy;         /**< saves previous frame energy. */
+          q31_t x0;             /**< saves previous input sample. */
+  } arm_lms_norm_instance_q31;
+
+
+  /**
+   * @brief Processing function for Q31 normalized LMS filter.
+   * @param[in]  S          points to an instance of the Q31 normalized LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_norm_q31(
+        arm_lms_norm_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pRef,
+        q31_t * pOut,
+        q31_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for Q31 normalized LMS filter.
+   * @param[in] S          points to an instance of the Q31 normalized LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer.
+   * @param[in] pState     points to state buffer.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   * @param[in] postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_norm_init_q31(
+        arm_lms_norm_instance_q31 * S,
+        uint16_t numTaps,
+        q31_t * pCoeffs,
+        q31_t * pState,
+        q31_t mu,
+        uint32_t blockSize,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Instance structure for the Q15 normalized LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< Number of coefficients in the filter. */
+          q15_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q15_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t mu;             /**< step size that controls filter coefficient updates. */
+          uint8_t postShift;    /**< bit shift applied to coefficients. */
+    const q15_t *recipTable;    /**< Points to the reciprocal initial value table. */
+          q15_t energy;         /**< saves previous frame energy. */
+          q15_t x0;             /**< saves previous input sample. */
+  } arm_lms_norm_instance_q15;
+
+
+  /**
+   * @brief Processing function for Q15 normalized LMS filter.
+   * @param[in]  S          points to an instance of the Q15 normalized LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_norm_q15(
+        arm_lms_norm_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pRef,
+        q15_t * pOut,
+        q15_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for Q15 normalized LMS filter.
+   * @param[in] S          points to an instance of the Q15 normalized LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer.
+   * @param[in] pState     points to state buffer.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   * @param[in] postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_norm_init_q15(
+        arm_lms_norm_instance_q15 * S,
+        uint16_t numTaps,
+        q15_t * pCoeffs,
+        q15_t * pState,
+        q15_t mu,
+        uint32_t blockSize,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Correlation of floating-point sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst);
+
+
+/**
+ @brief Correlation of Q15 sequences
+ @param[in]  pSrcA     points to the first input sequence
+ @param[in]  srcALen   length of the first input sequence
+ @param[in]  pSrcB     points to the second input sequence
+ @param[in]  srcBLen   length of the second input sequence
+ @param[out] pDst      points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+*/
+void arm_correlate_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch);
+
+
+/**
+  @brief Correlation of Q15 sequences.
+  @param[in]  pSrcA    points to the first input sequence
+  @param[in]  srcALen  length of the first input sequence
+  @param[in]  pSrcB    points to the second input sequence
+  @param[in]  srcBLen  length of the second input sequence
+  @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+ */
+  void arm_correlate_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+/**
+  @brief         Correlation of Q15 sequences (fast version).
+  @param[in]     pSrcA      points to the first input sequence
+  @param[in]     srcALen    length of the first input sequence
+  @param[in]     pSrcB      points to the second input sequence
+  @param[in]     srcBLen    length of the second input sequence
+  @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
+  @return        none
+ */
+void arm_correlate_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+/**
+  @brief Correlation of Q15 sequences (fast version).
+  @param[in]  pSrcA     points to the first input sequence.
+  @param[in]  srcALen   length of the first input sequence.
+  @param[in]  pSrcB     points to the second input sequence.
+  @param[in]  srcBLen   length of the second input sequence.
+  @param[out] pDst      points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+  @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ */
+void arm_correlate_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch);
+
+
+  /**
+   * @brief Correlation of Q31 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+/**
+  @brief Correlation of Q31 sequences (fast version).
+  @param[in]  pSrcA    points to the first input sequence
+  @param[in]  srcALen  length of the first input sequence
+  @param[in]  pSrcB    points to the second input sequence
+  @param[in]  srcBLen  length of the second input sequence
+  @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+ */
+void arm_correlate_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+ /**
+   * @brief Correlation of Q7 sequences.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   * @param[in]  pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   */
+  void arm_correlate_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Correlation of Q7 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst);
+
+
+  /**
+   * @brief Instance structure for the floating-point sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          float32_t *pState;            /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const float32_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_f32;
+
+  /**
+   * @brief Instance structure for the Q31 sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q31_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q31_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_q31;
+
+  /**
+   * @brief Instance structure for the Q15 sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q15_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q15_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q7 sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q7_t *pState;                 /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q7_t *pCoeffs;                /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_q7;
+
+
+  /**
+   * @brief Processing function for the floating-point sparse FIR filter.
+   * @param[in]  S           points to an instance of the floating-point sparse FIR structure.
+   * @param[in]  pSrc        points to the block of input data.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize   number of input samples to process per call.
+   */
+  void arm_fir_sparse_f32(
+        arm_fir_sparse_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        float32_t * pScratchIn,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the floating-point sparse FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_f32(
+        arm_fir_sparse_instance_f32 * S,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 sparse FIR filter.
+   * @param[in]  S           points to an instance of the Q31 sparse FIR structure.
+   * @param[in]  pSrc        points to the block of input data.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize   number of input samples to process per call.
+   */
+  void arm_fir_sparse_q31(
+        arm_fir_sparse_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        q31_t * pScratchIn,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q31 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_q31(
+        arm_fir_sparse_instance_q31 * S,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 sparse FIR filter.
+   * @param[in]  S            points to an instance of the Q15 sparse FIR structure.
+   * @param[in]  pSrc         points to the block of input data.
+   * @param[out] pDst         points to the block of output data
+   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
+   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize    number of input samples to process per call.
+   */
+  void arm_fir_sparse_q15(
+        arm_fir_sparse_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        q15_t * pScratchIn,
+        q31_t * pScratchOut,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q15 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_q15(
+        arm_fir_sparse_instance_q15 * S,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q7 sparse FIR filter.
+   * @param[in]  S            points to an instance of the Q7 sparse FIR structure.
+   * @param[in]  pSrc         points to the block of input data.
+   * @param[out] pDst         points to the block of output data
+   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
+   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize    number of input samples to process per call.
+   */
+  void arm_fir_sparse_q7(
+        arm_fir_sparse_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        q7_t * pScratchIn,
+        q31_t * pScratchOut,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q7 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q7 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_q7(
+        arm_fir_sparse_instance_q7 * S,
+        uint16_t numTaps,
+  const q7_t * pCoeffs,
+        q7_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Floating-point sin_cos function.
+   * @param[in]  theta   input value in degrees
+   * @param[out] pSinVal  points to the processed sine output.
+   * @param[out] pCosVal  points to the processed cos output.
+   */
+  void arm_sin_cos_f32(
+        float32_t theta,
+        float32_t * pSinVal,
+        float32_t * pCosVal);
+
+
+  /**
+   * @brief  Q31 sin_cos function.
+   * @param[in]  theta    scaled input value in degrees
+   * @param[out] pSinVal  points to the processed sine output.
+   * @param[out] pCosVal  points to the processed cosine output.
+   */
+  void arm_sin_cos_q31(
+        q31_t theta,
+        q31_t * pSinVal,
+        q31_t * pCosVal);
+
+
+  /**
+   * @brief  Floating-point complex conjugate.
+   * @param[in]  pSrc        points to the input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Q31 complex conjugate.
+   * @param[in]  pSrc        points to the input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex conjugate.
+   * @param[in]  pSrc        points to the input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex magnitude squared
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex magnitude squared
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex magnitude squared
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+ /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup PID PID Motor Control
+   *
+   * A Proportional Integral Derivative (PID) controller is a generic feedback control
+   * loop mechanism widely used in industrial control systems.
+   * A PID controller is the most commonly used type of feedback controller.
+   *
+   * This set of functions implements (PID) controllers
+   * for Q15, Q31, and floating-point data types.  The functions operate on a single sample
+   * of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the PID control data structure.  <code>in</code>
+   * is the input sample value. The functions return the output value.
+   *
+   * \par Algorithm:
+   * <pre>
+   *    y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
+   *    A0 = Kp + Ki + Kd
+   *    A1 = (-Kp ) - (2 * Kd )
+   *    A2 = Kd
+   * </pre>
+   *
+   * \par
+   * where \c Kp is proportional constant, \c Ki is Integral constant and \c Kd is Derivative constant
+   *
+   * \par
+   * \image html PID.gif "Proportional Integral Derivative Controller"
+   *
+   * \par
+   * The PID controller calculates an "error" value as the difference between
+   * the measured output and the reference input.
+   * The controller attempts to minimize the error by adjusting the process control inputs.
+   * The proportional value determines the reaction to the current error,
+   * the integral value determines the reaction based on the sum of recent errors,
+   * and the derivative value determines the reaction based on the rate at which the error has been changing.
+   *
+   * \par Instance Structure
+   * The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure.
+   * A separate instance structure must be defined for each PID Controller.
+   * There are separate instance structure declarations for each of the 3 supported data types.
+   *
+   * \par Reset Functions
+   * There is also an associated reset function for each data type which clears the state array.
+   *
+   * \par Initialization Functions
+   * There is also an associated initialization function for each data type.
+   * The initialization function performs the following operations:
+   * - Initializes the Gains A0, A1, A2 from Kp,Ki, Kd gains.
+   * - Zeros out the values in the state buffer.
+   *
+   * \par
+   * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
+   *
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the fixed-point versions of the PID Controller functions.
+   * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup PID
+   * @{
+   */
+
+  /**
+   * @brief         Process function for the floating-point PID Control.
+   * @param[in,out] S   is an instance of the floating-point PID Control structure
+   * @param[in]     in  input sample to process
+   * @return        processed output sample.
+   */
+  __STATIC_FORCEINLINE float32_t arm_pid_f32(
+  arm_pid_instance_f32 * S,
+  float32_t in)
+  {
+    float32_t out;
+
+    /* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]  */
+    out = (S->A0 * in) +
+      (S->A1 * S->state[0]) + (S->A2 * S->state[1]) + (S->state[2]);
+
+    /* Update state */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+
+  }
+
+/**
+  @brief         Process function for the Q31 PID Control.
+  @param[in,out] S  points to an instance of the Q31 PID Control structure
+  @param[in]     in  input sample to process
+  @return        processed output sample.
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 64-bit accumulator.
+         The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+         Thus, if the accumulator result overflows it wraps around rather than clip.
+         In order to avoid overflows completely the input signal must be scaled down by 2 bits as there are four additions.
+         After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
+ */
+__STATIC_FORCEINLINE q31_t arm_pid_q31(
+  arm_pid_instance_q31 * S,
+  q31_t in)
+  {
+    q63_t acc;
+    q31_t out;
+
+    /* acc = A0 * x[n]  */
+    acc = (q63_t) S->A0 * in;
+
+    /* acc += A1 * x[n-1] */
+    acc += (q63_t) S->A1 * S->state[0];
+
+    /* acc += A2 * x[n-2]  */
+    acc += (q63_t) S->A2 * S->state[1];
+
+    /* convert output to 1.31 format to add y[n-1] */
+    out = (q31_t) (acc >> 31U);
+
+    /* out += y[n-1] */
+    out += S->state[2];
+
+    /* Update state */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+  }
+
+
+/**
+  @brief         Process function for the Q15 PID Control.
+  @param[in,out] S   points to an instance of the Q15 PID Control structure
+  @param[in]     in  input sample to process
+  @return        processed output sample.
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using a 64-bit internal accumulator.
+         Both Gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+         The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+         There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+         After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+         Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+__STATIC_FORCEINLINE q15_t arm_pid_q15(
+  arm_pid_instance_q15 * S,
+  q15_t in)
+  {
+    q63_t acc;
+    q15_t out;
+
+#if defined (ARM_MATH_DSP)
+    /* Implementation of PID controller */
+
+    /* acc = A0 * x[n]  */
+    acc = (q31_t) __SMUAD((uint32_t)S->A0, (uint32_t)in);
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
+    acc = (q63_t)__SMLALD((uint32_t)S->A1, (uint32_t)read_q15x2 (S->state), (uint64_t)acc);
+#else
+    /* acc = A0 * x[n]  */
+    acc = ((q31_t) S->A0) * in;
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
+    acc += (q31_t) S->A1 * S->state[0];
+    acc += (q31_t) S->A2 * S->state[1];
+#endif
+
+    /* acc += y[n-1] */
+    acc += (q31_t) S->state[2] << 15;
+
+    /* saturate the output */
+    out = (q15_t) (__SSAT((q31_t)(acc >> 15), 16));
+
+    /* Update state */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+  }
+
+  /**
+   * @} end of PID group
+   */
+
+
+  /**
+   * @brief Floating-point matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * dst);
+
+
+  /**
+   * @brief Floating-point matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * dst);
+
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup clarke Vector Clarke Transform
+   * Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector.
+   * Generally the Clarke transform uses three-phase currents <code>Ia, Ib and Ic</code> to calculate currents
+   * in the two-phase orthogonal stator axis <code>Ialpha</code> and <code>Ibeta</code>.
+   * When <code>Ialpha</code> is superposed with <code>Ia</code> as shown in the figure below
+   * \image html clarke.gif Stator current space vector and its components in (a,b).
+   * and <code>Ia + Ib + Ic = 0</code>, in this condition <code>Ialpha</code> and <code>Ibeta</code>
+   * can be calculated using only <code>Ia</code> and <code>Ib</code>.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html clarkeFormula.gif
+   * where <code>Ia</code> and <code>Ib</code> are the instantaneous stator phases and
+   * <code>pIalpha</code> and <code>pIbeta</code> are the two coordinates of time invariant vector.
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Clarke transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup clarke
+   * @{
+   */
+
+  /**
+   *
+   * @brief  Floating-point Clarke transform
+   * @param[in]  Ia       input three-phase coordinate <code>a</code>
+   * @param[in]  Ib       input three-phase coordinate <code>b</code>
+   * @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+   * @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+   * @return        none
+   */
+  __STATIC_FORCEINLINE void arm_clarke_f32(
+  float32_t Ia,
+  float32_t Ib,
+  float32_t * pIalpha,
+  float32_t * pIbeta)
+  {
+    /* Calculate pIalpha using the equation, pIalpha = Ia */
+    *pIalpha = Ia;
+
+    /* Calculate pIbeta using the equation, pIbeta = (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib */
+    *pIbeta = ((float32_t) 0.57735026919 * Ia + (float32_t) 1.15470053838 * Ib);
+  }
+
+
+/**
+  @brief  Clarke transform for Q31 version
+  @param[in]  Ia       input three-phase coordinate <code>a</code>
+  @param[in]  Ib       input three-phase coordinate <code>b</code>
+  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the addition, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_clarke_q31(
+  q31_t Ia,
+  q31_t Ib,
+  q31_t * pIalpha,
+  q31_t * pIbeta)
+  {
+    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
+
+    /* Calculating pIalpha from Ia by equation pIalpha = Ia */
+    *pIalpha = Ia;
+
+    /* Intermediate product is calculated by (1/(sqrt(3)) * Ia) */
+    product1 = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30);
+
+    /* Intermediate product is calculated by (2/sqrt(3) * Ib) */
+    product2 = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30);
+
+    /* pIbeta is calculated by adding the intermediate products */
+    *pIbeta = __QADD(product1, product2);
+  }
+
+  /**
+   * @} end of clarke group
+   */
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup inv_clarke Vector Inverse Clarke Transform
+   * Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html clarkeInvFormula.gif
+   * where <code>pIa</code> and <code>pIb</code> are the instantaneous stator phases and
+   * <code>Ialpha</code> and <code>Ibeta</code> are the two coordinates of time invariant vector.
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Clarke transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup inv_clarke
+   * @{
+   */
+
+   /**
+   * @brief  Floating-point Inverse Clarke transform
+   * @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+   * @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+   * @param[out] pIa     points to output three-phase coordinate <code>a</code>
+   * @param[out] pIb     points to output three-phase coordinate <code>b</code>
+   * @return     none
+   */
+  __STATIC_FORCEINLINE void arm_inv_clarke_f32(
+  float32_t Ialpha,
+  float32_t Ibeta,
+  float32_t * pIa,
+  float32_t * pIb)
+  {
+    /* Calculating pIa from Ialpha by equation pIa = Ialpha */
+    *pIa = Ialpha;
+
+    /* Calculating pIb from Ialpha and Ibeta by equation pIb = -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta */
+    *pIb = -0.5f * Ialpha + 0.8660254039f * Ibeta;
+  }
+
+
+/**
+  @brief  Inverse Clarke transform for Q31 version
+  @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+  @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+  @param[out] pIa     points to output three-phase coordinate <code>a</code>
+  @param[out] pIb     points to output three-phase coordinate <code>b</code>
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_inv_clarke_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pIa,
+  q31_t * pIb)
+  {
+    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
+
+    /* Calculating pIa from Ialpha by equation pIa = Ialpha */
+    *pIa = Ialpha;
+
+    /* Intermediate product is calculated by (1/(2*sqrt(3)) * Ia) */
+    product1 = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31);
+
+    /* Intermediate product is calculated by (1/sqrt(3) * pIb) */
+    product2 = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31);
+
+    /* pIb is calculated by subtracting the products */
+    *pIb = __QSUB(product2, product1);
+  }
+
+  /**
+   * @} end of inv_clarke group
+   */
+
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup park Vector Park Transform
+   *
+   * Forward Park transform converts the input two-coordinate vector to flux and torque components.
+   * The Park transform can be used to realize the transformation of the <code>Ialpha</code> and the <code>Ibeta</code> currents
+   * from the stationary to the moving reference frame and control the spatial relationship between
+   * the stator vector current and rotor flux vector.
+   * If we consider the d axis aligned with the rotor flux, the diagram below shows the
+   * current vector and the relationship from the two reference frames:
+   * \image html park.gif "Stator current space vector and its component in (a,b) and in the d,q rotating reference frame"
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html parkFormula.gif
+   * where <code>Ialpha</code> and <code>Ibeta</code> are the stator vector components,
+   * <code>pId</code> and <code>pIq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
+   * cosine and sine values of theta (rotor flux position).
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Park transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup park
+   * @{
+   */
+
+  /**
+   * @brief Floating-point Park transform
+   * @param[in]  Ialpha  input two-phase vector coordinate alpha
+   * @param[in]  Ibeta   input two-phase vector coordinate beta
+   * @param[out] pId     points to output   rotor reference frame d
+   * @param[out] pIq     points to output   rotor reference frame q
+   * @param[in]  sinVal  sine value of rotation angle theta
+   * @param[in]  cosVal  cosine value of rotation angle theta
+   * @return     none
+   *
+   * The function implements the forward Park transform.
+   *
+   */
+  __STATIC_FORCEINLINE void arm_park_f32(
+  float32_t Ialpha,
+  float32_t Ibeta,
+  float32_t * pId,
+  float32_t * pIq,
+  float32_t sinVal,
+  float32_t cosVal)
+  {
+    /* Calculate pId using the equation, pId = Ialpha * cosVal + Ibeta * sinVal */
+    *pId = Ialpha * cosVal + Ibeta * sinVal;
+
+    /* Calculate pIq using the equation, pIq = - Ialpha * sinVal + Ibeta * cosVal */
+    *pIq = -Ialpha * sinVal + Ibeta * cosVal;
+  }
+
+
+/**
+  @brief  Park transform for Q31 version
+  @param[in]  Ialpha  input two-phase vector coordinate alpha
+  @param[in]  Ibeta   input two-phase vector coordinate beta
+  @param[out] pId     points to output rotor reference frame d
+  @param[out] pIq     points to output rotor reference frame q
+  @param[in]  sinVal  sine value of rotation angle theta
+  @param[in]  cosVal  cosine value of rotation angle theta
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the addition and subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_park_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pId,
+  q31_t * pIq,
+  q31_t sinVal,
+  q31_t cosVal)
+  {
+    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
+    q31_t product3, product4;                    /* Temporary variables used to store intermediate results */
+
+    /* Intermediate product is calculated by (Ialpha * cosVal) */
+    product1 = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31);
+
+    /* Intermediate product is calculated by (Ibeta * sinVal) */
+    product2 = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31);
+
+
+    /* Intermediate product is calculated by (Ialpha * sinVal) */
+    product3 = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31);
+
+    /* Intermediate product is calculated by (Ibeta * cosVal) */
+    product4 = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31);
+
+    /* Calculate pId by adding the two intermediate products 1 and 2 */
+    *pId = __QADD(product1, product2);
+
+    /* Calculate pIq by subtracting the two intermediate products 3 from 4 */
+    *pIq = __QSUB(product4, product3);
+  }
+
+  /**
+   * @} end of park group
+   */
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup inv_park Vector Inverse Park transform
+   * Inverse Park transform converts the input flux and torque components to two-coordinate vector.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html parkInvFormula.gif
+   * where <code>pIalpha</code> and <code>pIbeta</code> are the stator vector components,
+   * <code>Id</code> and <code>Iq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
+   * cosine and sine values of theta (rotor flux position).
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Park transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup inv_park
+   * @{
+   */
+
+   /**
+   * @brief  Floating-point Inverse Park transform
+   * @param[in]  Id       input coordinate of rotor reference frame d
+   * @param[in]  Iq       input coordinate of rotor reference frame q
+   * @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+   * @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+   * @param[in]  sinVal   sine value of rotation angle theta
+   * @param[in]  cosVal   cosine value of rotation angle theta
+   * @return     none
+   */
+  __STATIC_FORCEINLINE void arm_inv_park_f32(
+  float32_t Id,
+  float32_t Iq,
+  float32_t * pIalpha,
+  float32_t * pIbeta,
+  float32_t sinVal,
+  float32_t cosVal)
+  {
+    /* Calculate pIalpha using the equation, pIalpha = Id * cosVal - Iq * sinVal */
+    *pIalpha = Id * cosVal - Iq * sinVal;
+
+    /* Calculate pIbeta using the equation, pIbeta = Id * sinVal + Iq * cosVal */
+    *pIbeta = Id * sinVal + Iq * cosVal;
+  }
+
+
+/**
+  @brief  Inverse Park transform for   Q31 version
+  @param[in]  Id       input coordinate of rotor reference frame d
+  @param[in]  Iq       input coordinate of rotor reference frame q
+  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+  @param[in]  sinVal   sine value of rotation angle theta
+  @param[in]  cosVal   cosine value of rotation angle theta
+  @return     none
+
+  @par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the addition, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_inv_park_q31(
+  q31_t Id,
+  q31_t Iq,
+  q31_t * pIalpha,
+  q31_t * pIbeta,
+  q31_t sinVal,
+  q31_t cosVal)
+  {
+    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
+    q31_t product3, product4;                    /* Temporary variables used to store intermediate results */
+
+    /* Intermediate product is calculated by (Id * cosVal) */
+    product1 = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31);
+
+    /* Intermediate product is calculated by (Iq * sinVal) */
+    product2 = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31);
+
+
+    /* Intermediate product is calculated by (Id * sinVal) */
+    product3 = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31);
+
+    /* Intermediate product is calculated by (Iq * cosVal) */
+    product4 = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31);
+
+    /* Calculate pIalpha by using the two intermediate products 1 and 2 */
+    *pIalpha = __QSUB(product1, product2);
+
+    /* Calculate pIbeta by using the two intermediate products 3 and 4 */
+    *pIbeta = __QADD(product4, product3);
+  }
+
+  /**
+   * @} end of Inverse park group
+   */
+
+
+  /**
+   * @ingroup groupInterpolation
+   */
+
+  /**
+   * @defgroup LinearInterpolate Linear Interpolation
+   *
+   * Linear interpolation is a method of curve fitting using linear polynomials.
+   * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line
+   *
+   * \par
+   * \image html LinearInterp.gif "Linear interpolation"
+   *
+   * \par
+   * A  Linear Interpolate function calculates an output value(y), for the input(x)
+   * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
+   *
+   * \par Algorithm:
+   * <pre>
+   *       y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+   *       where x0, x1 are nearest values of input x
+   *             y0, y1 are nearest values to output y
+   * </pre>
+   *
+   * \par
+   * This set of functions implements Linear interpolation process
+   * for Q7, Q15, Q31, and floating-point data types.  The functions operate on a single
+   * sample of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the Linear Interpolate function data structure.
+   * <code>x</code> is the input sample value. The functions returns the output value.
+   *
+   * \par
+   * if x is outside of the table boundary, Linear interpolation returns first value of the table
+   * if x is below input range and returns last value of table if x is above range.
+   */
+
+  /**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   * @brief  Process function for the floating-point Linear Interpolation Function.
+   * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
+   * @param[in]     x  input sample to process
+   * @return y processed output sample.
+   *
+   */
+  __STATIC_FORCEINLINE float32_t arm_linear_interp_f32(
+  arm_linear_interp_instance_f32 * S,
+  float32_t x)
+  {
+    float32_t y;
+    float32_t x0, x1;                            /* Nearest input values */
+    float32_t y0, y1;                            /* Nearest output values */
+    float32_t xSpacing = S->xSpacing;            /* spacing between input values */
+    int32_t i;                                   /* Index variable */
+    float32_t *pYData = S->pYData;               /* pointer to output table */
+
+    /* Calculation of index */
+    i = (int32_t) ((x - S->x1) / xSpacing);
+
+    if (i < 0)
+    {
+      /* Iniatilize output for below specified range as least output value of table */
+      y = pYData[0];
+    }
+    else if ((uint32_t)i >= (S->nValues - 1))
+    {
+      /* Iniatilize output for above specified range as last output value of table */
+      y = pYData[S->nValues - 1];
+    }
+    else
+    {
+      /* Calculation of nearest input values */
+      x0 = S->x1 +  i      * xSpacing;
+      x1 = S->x1 + (i + 1) * xSpacing;
+
+      /* Read of nearest output values */
+      y0 = pYData[i];
+      y1 = pYData[i + 1];
+
+      /* Calculation of output */
+      y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
+
+    }
+
+    /* returns output value */
+    return (y);
+  }
+
+
+   /**
+   *
+   * @brief  Process function for the Q31 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q31 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   *
+   */
+  __STATIC_FORCEINLINE q31_t arm_linear_interp_q31(
+  q31_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q31_t y;                                     /* output */
+    q31_t y0, y1;                                /* Nearest output values */
+    q31_t fract;                                 /* fractional part */
+    int32_t index;                               /* Index to read nearest output values */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    index = ((x & (q31_t)0xFFF00000) >> 20);
+
+    if (index >= (int32_t)(nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+    else if (index < 0)
+    {
+      return (pYData[0]);
+    }
+    else
+    {
+      /* 20 bits for the fractional part */
+      /* shift left by 11 to keep fract in 1.31 format */
+      fract = (x & 0x000FFFFF) << 11;
+
+      /* Read two nearest output values from the index in 1.31(q31) format */
+      y0 = pYData[index];
+      y1 = pYData[index + 1];
+
+      /* Calculation of y0 * (1-fract) and y is in 2.30 format */
+      y = ((q31_t) ((q63_t) y0 * (0x7FFFFFFF - fract) >> 32));
+
+      /* Calculation of y0 * (1-fract) + y1 *fract and y is in 2.30 format */
+      y += ((q31_t) (((q63_t) y1 * fract) >> 32));
+
+      /* Convert y to 1.31 format */
+      return (y << 1U);
+    }
+  }
+
+
+  /**
+   *
+   * @brief  Process function for the Q15 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q15 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   *
+   */
+  __STATIC_FORCEINLINE q15_t arm_linear_interp_q15(
+  q15_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q63_t y;                                     /* output */
+    q15_t y0, y1;                                /* Nearest output values */
+    q31_t fract;                                 /* fractional part */
+    int32_t index;                               /* Index to read nearest output values */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    index = ((x & (int32_t)0xFFF00000) >> 20);
+
+    if (index >= (int32_t)(nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+    else if (index < 0)
+    {
+      return (pYData[0]);
+    }
+    else
+    {
+      /* 20 bits for the fractional part */
+      /* fract is in 12.20 format */
+      fract = (x & 0x000FFFFF);
+
+      /* Read two nearest output values from the index */
+      y0 = pYData[index];
+      y1 = pYData[index + 1];
+
+      /* Calculation of y0 * (1-fract) and y is in 13.35 format */
+      y = ((q63_t) y0 * (0xFFFFF - fract));
+
+      /* Calculation of (y0 * (1-fract) + y1 * fract) and y is in 13.35 format */
+      y += ((q63_t) y1 * (fract));
+
+      /* convert y to 1.15 format */
+      return (q15_t) (y >> 20);
+    }
+  }
+
+
+  /**
+   *
+   * @brief  Process function for the Q7 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q7 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   */
+  __STATIC_FORCEINLINE q7_t arm_linear_interp_q7(
+  q7_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q31_t y;                                     /* output */
+    q7_t y0, y1;                                 /* Nearest output values */
+    q31_t fract;                                 /* fractional part */
+    uint32_t index;                              /* Index to read nearest output values */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    if (x < 0)
+    {
+      return (pYData[0]);
+    }
+    index = (x >> 20) & 0xfff;
+
+    if (index >= (nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+    else
+    {
+      /* 20 bits for the fractional part */
+      /* fract is in 12.20 format */
+      fract = (x & 0x000FFFFF);
+
+      /* Read two nearest output values from the index and are in 1.7(q7) format */
+      y0 = pYData[index];
+      y1 = pYData[index + 1];
+
+      /* Calculation of y0 * (1-fract ) and y is in 13.27(q27) format */
+      y = ((y0 * (0xFFFFF - fract)));
+
+      /* Calculation of y1 * fract + y0 * (1-fract) and y is in 13.27(q27) format */
+      y += (y1 * fract);
+
+      /* convert y to 1.7(q7) format */
+      return (q7_t) (y >> 20);
+     }
+  }
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for floating-point data.
+   * @param[in] x  input value in radians.
+   * @return  sin(x).
+   */
+  float32_t arm_sin_f32(
+  float32_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for Q31 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  sin(x).
+   */
+  q31_t arm_sin_q31(
+  q31_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for Q15 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  sin(x).
+   */
+  q15_t arm_sin_q15(
+  q15_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for floating-point data.
+   * @param[in] x  input value in radians.
+   * @return  cos(x).
+   */
+  float32_t arm_cos_f32(
+  float32_t x);
+
+
+  /**
+   * @brief Fast approximation to the trigonometric cosine function for Q31 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  cos(x).
+   */
+  q31_t arm_cos_q31(
+  q31_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for Q15 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  cos(x).
+   */
+  q15_t arm_cos_q15(
+  q15_t x);
+
+
+/**
+  @brief         Floating-point vector of log values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vlog_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+/**
+  @brief         Floating-point vector of exp values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vexp_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @ingroup groupFastMath
+   */
+
+
+  /**
+   * @defgroup SQRT Square Root
+   *
+   * Computes the square root of a number.
+   * There are separate functions for Q15, Q31, and floating-point data types.
+   * The square root function is computed using the Newton-Raphson algorithm.
+   * This is an iterative algorithm of the form:
+   * <pre>
+   *      x1 = x0 - f(x0)/f'(x0)
+   * </pre>
+   * where <code>x1</code> is the current estimate,
+   * <code>x0</code> is the previous estimate, and
+   * <code>f'(x0)</code> is the derivative of <code>f()</code> evaluated at <code>x0</code>.
+   * For the square root function, the algorithm reduces to:
+   * <pre>
+   *     x0 = in/2                         [initial guess]
+   *     x1 = 1/2 * ( x0 + in / x0)        [each iteration]
+   * </pre>
+   */
+
+
+  /**
+   * @addtogroup SQRT
+   * @{
+   */
+
+/**
+  @brief         Floating-point square root function.
+  @param[in]     in    input value
+  @param[out]    pOut  square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+__STATIC_FORCEINLINE arm_status arm_sqrt_f32(
+  float32_t in,
+  float32_t * pOut)
+  {
+    if (in >= 0.0f)
+    {
+#if defined ( __CC_ARM )
+  #if defined __TARGET_FPU_VFP
+      *pOut = __sqrtf(in);
+  #else
+      *pOut = sqrtf(in);
+  #endif
+
+#elif defined ( __ICCARM__ )
+  #if defined __ARMVFP__
+      __ASM("VSQRT.F32 %0,%1" : "=t"(*pOut) : "t"(in));
+  #else
+      *pOut = sqrtf(in);
+  #endif
+
+#else
+      *pOut = sqrtf(in);
+#endif
+
+      return (ARM_MATH_SUCCESS);
+    }
+    else
+    {
+      *pOut = 0.0f;
+      return (ARM_MATH_ARGUMENT_ERROR);
+    }
+  }
+
+
+/**
+  @brief         Q31 square root function.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+arm_status arm_sqrt_q31(
+  q31_t in,
+  q31_t * pOut);
+
+
+/**
+  @brief         Q15 square root function.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+arm_status arm_sqrt_q15(
+  q15_t in,
+  q15_t * pOut);
+
+  /**
+   * @brief  Vector Floating-point square root function.
+   * @param[in]  pIn   input vector.
+   * @param[out] pOut  vector of square roots of input elements.
+   * @param[in]  len   length of input vector.
+   * @return The function returns ARM_MATH_SUCCESS if input value is positive value or ARM_MATH_ARGUMENT_ERROR if
+   * <code>in</code> is negative value and returns zero output for negative values.
+   */
+  void arm_vsqrt_f32(
+  float32_t * pIn,
+  float32_t * pOut,
+  uint16_t len);
+
+  void arm_vsqrt_q31(
+  q31_t * pIn,
+  q31_t * pOut,
+  uint16_t len);
+
+  void arm_vsqrt_q15(
+  q15_t * pIn,
+  q15_t * pOut,
+  uint16_t len);
+
+  /**
+   * @} end of SQRT group
+   */
+
+
+  /**
+   * @brief floating-point Circular write function.
+   */
+  __STATIC_FORCEINLINE void arm_circularWrite_f32(
+  int32_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const int32_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t wOffset;
+
+    /* Copy the value of Index pointer that points
+     * to the current location where the input samples to be copied */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the input sample to the circular buffer */
+      circBuffer[wOffset] = *src;
+
+      /* Update the input pointer */
+      src += srcInc;
+
+      /* Circularly update wOffset.  Watch out for positive and negative value */
+      wOffset += bufferInc;
+      if (wOffset >= L)
+        wOffset -= L;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Update the index pointer */
+    *writeOffset = (uint16_t)wOffset;
+  }
+
+
+
+  /**
+   * @brief floating-point Circular Read function.
+   */
+  __STATIC_FORCEINLINE void arm_circularRead_f32(
+  int32_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  int32_t * dst,
+  int32_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t rOffset;
+    int32_t* dst_end;
+
+    /* Copy the value of Index pointer that points
+     * to the current location from where the input samples to be read */
+    rOffset = *readOffset;
+    dst_end = dst_base + dst_length;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the sample from the circular buffer to the destination buffer */
+      *dst = circBuffer[rOffset];
+
+      /* Update the input pointer */
+      dst += dstInc;
+
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Circularly update rOffset.  Watch out for positive and negative value  */
+      rOffset += bufferInc;
+
+      if (rOffset >= L)
+      {
+        rOffset -= L;
+      }
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Update the index pointer */
+    *readOffset = rOffset;
+  }
+
+
+  /**
+   * @brief Q15 Circular write function.
+   */
+  __STATIC_FORCEINLINE void arm_circularWrite_q15(
+  q15_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const q15_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t wOffset;
+
+    /* Copy the value of Index pointer that points
+     * to the current location where the input samples to be copied */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the input sample to the circular buffer */
+      circBuffer[wOffset] = *src;
+
+      /* Update the input pointer */
+      src += srcInc;
+
+      /* Circularly update wOffset.  Watch out for positive and negative value */
+      wOffset += bufferInc;
+      if (wOffset >= L)
+        wOffset -= L;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Update the index pointer */
+    *writeOffset = (uint16_t)wOffset;
+  }
+
+
+  /**
+   * @brief Q15 Circular Read function.
+   */
+  __STATIC_FORCEINLINE void arm_circularRead_q15(
+  q15_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  q15_t * dst,
+  q15_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0;
+    int32_t rOffset;
+    q15_t* dst_end;
+
+    /* Copy the value of Index pointer that points
+     * to the current location from where the input samples to be read */
+    rOffset = *readOffset;
+
+    dst_end = dst_base + dst_length;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the sample from the circular buffer to the destination buffer */
+      *dst = circBuffer[rOffset];
+
+      /* Update the input pointer */
+      dst += dstInc;
+
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Circularly update wOffset.  Watch out for positive and negative value */
+      rOffset += bufferInc;
+
+      if (rOffset >= L)
+      {
+        rOffset -= L;
+      }
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Update the index pointer */
+    *readOffset = rOffset;
+  }
+
+
+  /**
+   * @brief Q7 Circular write function.
+   */
+  __STATIC_FORCEINLINE void arm_circularWrite_q7(
+  q7_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const q7_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t wOffset;
+
+    /* Copy the value of Index pointer that points
+     * to the current location where the input samples to be copied */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the input sample to the circular buffer */
+      circBuffer[wOffset] = *src;
+
+      /* Update the input pointer */
+      src += srcInc;
+
+      /* Circularly update wOffset.  Watch out for positive and negative value */
+      wOffset += bufferInc;
+      if (wOffset >= L)
+        wOffset -= L;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Update the index pointer */
+    *writeOffset = (uint16_t)wOffset;
+  }
+
+
+  /**
+   * @brief Q7 Circular Read function.
+   */
+  __STATIC_FORCEINLINE void arm_circularRead_q7(
+  q7_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  q7_t * dst,
+  q7_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0;
+    int32_t rOffset;
+    q7_t* dst_end;
+
+    /* Copy the value of Index pointer that points
+     * to the current location from where the input samples to be read */
+    rOffset = *readOffset;
+
+    dst_end = dst_base + dst_length;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the sample from the circular buffer to the destination buffer */
+      *dst = circBuffer[rOffset];
+
+      /* Update the input pointer */
+      dst += dstInc;
+
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Circularly update rOffset.  Watch out for positive and negative value */
+      rOffset += bufferInc;
+
+      if (rOffset >= L)
+      {
+        rOffset -= L;
+      }
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Update the index pointer */
+    *readOffset = rOffset;
+  }
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_power_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q63_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_power_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_power_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q63_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_power_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_mean_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_mean_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_mean_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_mean_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_var_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_var_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_var_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_rms_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_rms_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_rms_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_std_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_std_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_std_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Floating-point complex magnitude
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex magnitude
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex magnitude
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex dot product
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result returned here
+   * @param[out] imagResult  imaginary part of the result returned here
+   */
+  void arm_cmplx_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t numSamples,
+        q31_t * realResult,
+        q31_t * imagResult);
+
+
+  /**
+   * @brief  Q31 complex dot product
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result returned here
+   * @param[out] imagResult  imaginary part of the result returned here
+   */
+  void arm_cmplx_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t numSamples,
+        q63_t * realResult,
+        q63_t * imagResult);
+
+
+  /**
+   * @brief  Floating-point complex dot product
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result returned here
+   * @param[out] imagResult  imaginary part of the result returned here
+   */
+  void arm_cmplx_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t numSamples,
+        float32_t * realResult,
+        float32_t * imagResult);
+
+
+  /**
+   * @brief  Q15 complex-by-real multiplication
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_q15(
+  const q15_t * pSrcCmplx,
+  const q15_t * pSrcReal,
+        q15_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex-by-real multiplication
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_q31(
+  const q31_t * pSrcCmplx,
+  const q31_t * pSrcReal,
+        q31_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex-by-real multiplication
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_f32(
+  const float32_t * pSrcCmplx,
+  const float32_t * pSrcReal,
+        float32_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Minimum value of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] result     is output pointer
+   * @param[in]  index      is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * result,
+        uint32_t * index);
+
+
+  /**
+   * @brief  Minimum value of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[in]  pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Minimum value of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Minimum value of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q7 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void arm_max_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q15 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void arm_max_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q31 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void arm_max_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a floating-point vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void arm_max_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+  /**
+    @brief         Maximum value of a floating-point vector.
+    @param[in]     pSrc       points to the input vector
+    @param[in]     blockSize  number of samples in input vector
+    @param[out]    pResult    maximum value returned here
+    @return        none
+   */
+  void arm_max_no_idx_f32(
+      const float32_t *pSrc,
+      uint32_t   blockSize,
+      float32_t *pResult);
+
+  /**
+   * @brief  Q15 complex-by-complex multiplication
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex-by-complex multiplication
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex-by-complex multiplication
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief Converts the elements of the floating-point vector to Q31 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q31 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+  void arm_float_to_q31(
+  const float32_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Converts the elements of the floating-point vector to Q15 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q15 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+  void arm_float_to_q15(
+  const float32_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Converts the elements of the floating-point vector to Q7 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q7 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+  void arm_float_to_q7(
+  const float32_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q31 vector to floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q31_to_float(
+  const q31_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q31 vector to Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q31_to_q15(
+  const q31_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q31 vector to Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q31_to_q7(
+  const q31_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q15 vector to floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q15_to_float(
+  const q15_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q15 vector to Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q15 vector to Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q15_to_q7(
+  const q15_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q7 vector to floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void arm_q7_to_float(
+  const q7_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q7 vector to Q31 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_q7_to_q31(
+  const q7_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q7 vector to Q15 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void arm_q7_to_q15(
+  const q7_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+/**
+ * @brief Struct for specifying SVM Kernel
+ */
+typedef enum
+{
+    ARM_ML_KERNEL_LINEAR = 0,
+             /**< Linear kernel */
+    ARM_ML_KERNEL_POLYNOMIAL = 1,
+             /**< Polynomial kernel */
+    ARM_ML_KERNEL_RBF = 2,
+             /**< Radial Basis Function kernel */
+    ARM_ML_KERNEL_SIGMOID = 3
+             /**< Sigmoid kernel */
+} arm_ml_kernel_type;
+
+
+/**
+ * @brief Instance structure for linear SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Dual coefficients */
+  const float32_t *supportVectors;        /**< Support vectors */
+  const int32_t   *classes;               /**< The two SVM classes */
+} arm_svm_linear_instance_f32;
+
+
+/**
+ * @brief Instance structure for polynomial SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Dual coefficients */
+  const float32_t *supportVectors;        /**< Support vectors */
+  const int32_t   *classes;               /**< The two SVM classes */
+  int32_t         degree;                 /**< Polynomial degree */
+  float32_t       coef0;                  /**< Polynomial constant */
+  float32_t       gamma;                  /**< Gamma factor */
+} arm_svm_polynomial_instance_f32;
+
+/**
+ * @brief Instance structure for rbf SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Dual coefficients */
+  const float32_t *supportVectors;        /**< Support vectors */
+  const int32_t   *classes;               /**< The two SVM classes */
+  float32_t       gamma;                  /**< Gamma factor */
+} arm_svm_rbf_instance_f32;
+
+/**
+ * @brief Instance structure for sigmoid SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Dual coefficients */
+  const float32_t *supportVectors;        /**< Support vectors */
+  const int32_t   *classes;               /**< The two SVM classes */
+  float32_t       coef0;                  /**< Independant constant */
+  float32_t       gamma;                  /**< Gamma factor */
+} arm_svm_sigmoid_instance_f32;
+
+/**
+ * @brief        SVM linear instance init function
+ * @param[in]    S                      Parameters for SVM functions
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t  *classes);
+
+/**
+ * @brief SVM linear prediction
+ * @param[in]    S          Pointer to an instance of the linear SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+
+void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S,
+   const float32_t * in,
+   int32_t * pResult);
+
+
+/**
+ * @brief        SVM polynomial instance init function
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    degree                 Polynomial degree
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t   *classes,
+  int32_t      degree,
+  float32_t coef0,
+  float32_t gamma
+  );
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S,
+   const float32_t * in,
+   int32_t * pResult);
+
+
+/**
+ * @brief        SVM radial basis function instance init function
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t   *classes,
+  float32_t gamma
+  );
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in]    S         Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in        Pointer to input vector
+ * @param[out]   pResult   decision value
+ * @return none.
+ *
+ */
+void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S,
+   const float32_t * in,
+   int32_t * pResult);
+
+/**
+ * @brief        SVM sigmoid instance init function
+ * @param[in]    S                      points to an instance of the rbf SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t   *classes,
+  float32_t coef0,
+  float32_t gamma
+  );
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in]    S        Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in       Pointer to input vector
+ * @param[out]   pResult  Decision value
+ * @return none.
+ *
+ */
+void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S,
+   const float32_t * in,
+   int32_t * pResult);
+
+
+
+/**
+ * @brief Instance structure for Naive Gaussian Bayesian estimator.
+ */
+typedef struct
+{
+  uint32_t vectorDimension;  /**< Dimension of vector space */
+  uint32_t numberOfClasses;  /**< Number of different classes  */
+  const float32_t *theta;          /**< Mean values for the Gaussians */
+  const float32_t *sigma;          /**< Variances for the Gaussians */
+  const float32_t *classPriors;    /**< Class prior probabilities */
+  float32_t epsilon;         /**< Additive value to variances */
+} arm_gaussian_naive_bayes_instance_f32;
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  S         points to a naive bayes instance structure
+ * @param[in]  in        points to the elements of the input vector.
+ * @param[in]  pBuffer   points to a buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ */
+
+
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer);
+
+/**
+ * @brief Computation of the LogSumExp
+ *
+ * In probabilistic computations, the dynamic of the probability values can be very
+ * wide because they come from gaussian functions.
+ * To avoid underflow and overflow issues, the values are represented by their log.
+ * In this representation, multiplying the original exp values is easy : their logs are added.
+ * But adding the original exp values is requiring some special handling and it is the
+ * goal of the LogSumExp function.
+ *
+ * If the values are x1...xn, the function is computing:
+ *
+ * ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
+ * rounding issues are minimised.
+ *
+ * The max xm of the values is extracted and the function is computing:
+ * xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
+ *
+ * @param[in]  *in         Pointer to an array of input values.
+ * @param[in]  blockSize   Number of samples in the input array.
+ * @return LogSumExp
+ *
+ */
+
+
+float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize);
+
+/**
+ * @brief Dot product with log arithmetic
+ *
+ * Vectors are containing the log of the samples
+ *
+ * @param[in]       pSrcA points to the first input vector
+ * @param[in]       pSrcB points to the second input vector
+ * @param[in]       blockSize number of samples in each vector
+ * @param[in]       pTmpBuffer temporary buffer of length blockSize
+ * @return The log of the dot product .
+ *
+ */
+
+
+float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
+  const float32_t * pSrcB,
+  uint32_t blockSize,
+  float32_t *pTmpBuffer);
+
+/**
+ * @brief Entropy
+ *
+ * @param[in]  pSrcA        Array of input values.
+ * @param[in]  blockSize    Number of samples in the input array.
+ * @return     Entropy      -Sum(p ln p)
+ *
+ */
+
+
+float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize);
+
+
+/**
+ * @brief Entropy
+ *
+ * @param[in]  pSrcA        Array of input values.
+ * @param[in]  blockSize    Number of samples in the input array.
+ * @return     Entropy      -Sum(p ln p)
+ *
+ */
+
+
+float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize);
+
+
+/**
+ * @brief Kullback-Leibler
+ *
+ * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
+ * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
+ * @param[in]  blockSize     Number of samples in the input array.
+ * @return Kullback-Leibler  Divergence D(A || B)
+ *
+ */
+float32_t arm_kullback_leibler_f32(const float32_t * pSrcA
+  ,const float32_t * pSrcB
+  ,uint32_t blockSize);
+
+
+/**
+ * @brief Kullback-Leibler
+ *
+ * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
+ * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
+ * @param[in]  blockSize     Number of samples in the input array.
+ * @return Kullback-Leibler  Divergence D(A || B)
+ *
+ */
+float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,
+                const float64_t * pSrcB,
+                uint32_t blockSize);
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in]    *in           Array of input values.
+ * @param[in]    *weigths      Weights
+ * @param[in]    blockSize     Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+float32_t arm_weighted_sum_f32(const float32_t *in
+  , const float32_t *weigths
+  , uint32_t blockSize);
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in]    in         List of vectors
+ * @param[in]    weights    Weights of the vectors
+ * @param[out]   out        Barycenter
+ * @param[in]    nbVectors  Number of vectors
+ * @param[in]    vecDim     Dimension of space (vector dimension)
+ * @return       None
+ *
+ */
+void arm_barycenter_f32(const float32_t *in
+  , const float32_t *weights
+  , float32_t *out
+  , uint32_t nbVectors
+  , uint32_t vecDim);
+
+/**
+ * @brief        Euclidean distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Bray-Curtis distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Canberra distance between two vectors
+ *
+ * This function may divide by zero when samples pA[i] and pB[i] are both zero.
+ * The result of the computation will be correct. So the division per zero may be
+ * ignored.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Chebyshev distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Cityblock (Manhattan) distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Correlation distance between two vectors
+ *
+ * The input vectors are modified in place !
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_correlation_distance_f32(float32_t *pA,float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Cosine distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float32_t arm_cosine_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Jensen-Shannon distance between two vectors
+ *
+ * This function is assuming that elements of second vector are > 0
+ * and 0 only when the corresponding element of first vector is 0.
+ * Otherwise the result of the computation does not make sense
+ * and for speed reasons, the cases returning NaN or Infinity are not
+ * managed.
+ *
+ * When the function is computing x log (x / y) with x 0 and y 0,
+ * it will compute the right value (0) but a division per zero will occur
+ * and shoudl be ignored in client code.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB,uint32_t blockSize);
+
+/**
+ * @brief        Minkowski distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    n          Norm order (>= 2)
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+
+
+float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, int32_t order, uint32_t blockSize);
+
+/**
+ * @brief        Dice distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    order           Distance order
+ * @param[in]    blockSize       Number of samples
+ * @return distance
+ *
+ */
+
+
+float32_t arm_dice_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Hamming distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_hamming_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Jaccard distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_jaccard_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Kulsinski distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_kulsinski_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Roger Stanimoto distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_rogerstanimoto_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Russell-Rao distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_russellrao_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Sokal-Michener distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_sokalmichener_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Sokal-Sneath distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_sokalsneath_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Yule distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_yule_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+
+  /**
+   * @ingroup groupInterpolation
+   */
+
+  /**
+   * @defgroup BilinearInterpolate Bilinear Interpolation
+   *
+   * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
+   * The underlying function <code>f(x, y)</code> is sampled on a regular grid and the interpolation process
+   * determines values between the grid points.
+   * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
+   * Bilinear interpolation is often used in image processing to rescale images.
+   * The CMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
+   *
+   * <b>Algorithm</b>
+   * \par
+   * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
+   * For floating-point, the instance structure is defined as:
+   * <pre>
+   *   typedef struct
+   *   {
+   *     uint16_t numRows;
+   *     uint16_t numCols;
+   *     float32_t *pData;
+   * } arm_bilinear_interp_instance_f32;
+   * </pre>
+   *
+   * \par
+   * where <code>numRows</code> specifies the number of rows in the table;
+   * <code>numCols</code> specifies the number of columns in the table;
+   * and <code>pData</code> points to an array of size <code>numRows*numCols</code> values.
+   * The data table <code>pTable</code> is organized in row order and the supplied data values fall on integer indexes.
+   * That is, table element (x,y) is located at <code>pTable[x + y*numCols]</code> where x and y are integers.
+   *
+   * \par
+   * Let <code>(x, y)</code> specify the desired interpolation point.  Then define:
+   * <pre>
+   *     XF = floor(x)
+   *     YF = floor(y)
+   * </pre>
+   * \par
+   * The interpolated output point is computed as:
+   * <pre>
+   *  f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+   *           + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+   *           + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+   *           + f(XF+1, YF+1) * (x-XF)*(y-YF)
+   * </pre>
+   * Note that the coordinates (x, y) contain integer and fractional components.
+   * The integer components specify which portion of the table to use while the
+   * fractional components control the interpolation processor.
+   *
+   * \par
+   * if (x,y) are outside of the table boundary, Bilinear interpolation returns zero output.
+   */
+
+
+  /**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+  /**
+  * @brief  Floating-point bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate.
+  * @param[in]     Y  interpolation coordinate.
+  * @return out interpolated value.
+  */
+  __STATIC_FORCEINLINE float32_t arm_bilinear_interp_f32(
+  const arm_bilinear_interp_instance_f32 * S,
+  float32_t X,
+  float32_t Y)
+  {
+    float32_t out;
+    float32_t f00, f01, f10, f11;
+    float32_t *pData = S->pData;
+    int32_t xIndex, yIndex, index;
+    float32_t xdiff, ydiff;
+    float32_t b1, b2, b3, b4;
+
+    xIndex = (int32_t) X;
+    yIndex = (int32_t) Y;
+
+    /* Care taken for table outside boundary */
+    /* Returns zero output when values are outside table boundary */
+    if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* Calculation of index for two nearest points in X-direction */
+    index = (xIndex ) + (yIndex ) * S->numCols;
+
+
+    /* Read two nearest points in X-direction */
+    f00 = pData[index];
+    f01 = pData[index + 1];
+
+    /* Calculation of index for two nearest points in Y-direction */
+    index = (xIndex ) + (yIndex+1) * S->numCols;
+
+
+    /* Read two nearest points in Y-direction */
+    f10 = pData[index];
+    f11 = pData[index + 1];
+
+    /* Calculation of intermediate values */
+    b1 = f00;
+    b2 = f01 - f00;
+    b3 = f10 - f00;
+    b4 = f00 - f01 - f10 + f11;
+
+    /* Calculation of fractional part in X */
+    xdiff = X - xIndex;
+
+    /* Calculation of fractional part in Y */
+    ydiff = Y - yIndex;
+
+    /* Calculation of bi-linear interpolated output */
+    out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
+
+    /* return to application */
+    return (out);
+  }
+
+
+  /**
+  * @brief  Q31 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return out interpolated value.
+  */
+  __STATIC_FORCEINLINE q31_t arm_bilinear_interp_q31(
+  arm_bilinear_interp_instance_q31 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q31_t out;                                   /* Temporary output */
+    q31_t acc = 0;                               /* output */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    q31_t x1, x2, y1, y2;                        /* Nearest output values */
+    int32_t rI, cI;                              /* Row and column indices */
+    q31_t *pYData = S->pData;                    /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* num of rows */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Care taken for table outside boundary */
+    /* Returns zero output when values are outside table boundary */
+    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* shift left xfract by 11 to keep 1.31 format */
+    xfract = (X & 0x000FFFFF) << 11U;
+
+    /* Read two nearest output values from the index */
+    x1 = pYData[(rI) + (int32_t)nCols * (cI)    ];
+    x2 = pYData[(rI) + (int32_t)nCols * (cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* shift left yfract by 11 to keep 1.31 format */
+    yfract = (Y & 0x000FFFFF) << 11U;
+
+    /* Read two nearest output values from the index */
+    y1 = pYData[(rI) + (int32_t)nCols * (cI + 1)    ];
+    y2 = pYData[(rI) + (int32_t)nCols * (cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 3.29(q29) format */
+    out = ((q31_t) (((q63_t) x1  * (0x7FFFFFFF - xfract)) >> 32));
+    acc = ((q31_t) (((q63_t) out * (0x7FFFFFFF - yfract)) >> 32));
+
+    /* x2 * (xfract) * (1-yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) x2 * (0x7FFFFFFF - yfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (xfract) >> 32));
+
+    /* y1 * (1 - xfract) * (yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) y1 * (0x7FFFFFFF - xfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+    /* y2 * (xfract) * (yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) y2 * (xfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+    /* Convert acc to 1.31(q31) format */
+    return ((q31_t)(acc << 2));
+  }
+
+
+  /**
+  * @brief  Q15 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return out interpolated value.
+  */
+  __STATIC_FORCEINLINE q15_t arm_bilinear_interp_q15(
+  arm_bilinear_interp_instance_q15 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q63_t acc = 0;                               /* output */
+    q31_t out;                                   /* Temporary output */
+    q15_t x1, x2, y1, y2;                        /* Nearest output values */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    int32_t rI, cI;                              /* Row and column indices */
+    q15_t *pYData = S->pData;                    /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* num of rows */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Care taken for table outside boundary */
+    /* Returns zero output when values are outside table boundary */
+    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* xfract should be in 12.20 format */
+    xfract = (X & 0x000FFFFF);
+
+    /* Read two nearest output values from the index */
+    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
+    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* yfract should be in 12.20 format */
+    yfract = (Y & 0x000FFFFF);
+
+    /* Read two nearest output values from the index */
+    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
+    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 13.51 format */
+
+    /* x1 is in 1.15(q15), xfract in 12.20 format and out is in 13.35 format */
+    /* convert 13.35 to 13.31 by right shifting  and out is in 1.31 */
+    out = (q31_t) (((q63_t) x1 * (0x0FFFFF - xfract)) >> 4U);
+    acc = ((q63_t) out * (0x0FFFFF - yfract));
+
+    /* x2 * (xfract) * (1-yfract)  in 1.51 and adding to acc */
+    out = (q31_t) (((q63_t) x2 * (0x0FFFFF - yfract)) >> 4U);
+    acc += ((q63_t) out * (xfract));
+
+    /* y1 * (1 - xfract) * (yfract)  in 1.51 and adding to acc */
+    out = (q31_t) (((q63_t) y1 * (0x0FFFFF - xfract)) >> 4U);
+    acc += ((q63_t) out * (yfract));
+
+    /* y2 * (xfract) * (yfract)  in 1.51 and adding to acc */
+    out = (q31_t) (((q63_t) y2 * (xfract)) >> 4U);
+    acc += ((q63_t) out * (yfract));
+
+    /* acc is in 13.51 format and down shift acc by 36 times */
+    /* Convert out to 1.15 format */
+    return ((q15_t)(acc >> 36));
+  }
+
+
+  /**
+  * @brief  Q7 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return out interpolated value.
+  */
+  __STATIC_FORCEINLINE q7_t arm_bilinear_interp_q7(
+  arm_bilinear_interp_instance_q7 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q63_t acc = 0;                               /* output */
+    q31_t out;                                   /* Temporary output */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    q7_t x1, x2, y1, y2;                         /* Nearest output values */
+    int32_t rI, cI;                              /* Row and column indices */
+    q7_t *pYData = S->pData;                     /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* num of rows */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Care taken for table outside boundary */
+    /* Returns zero output when values are outside table boundary */
+    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* xfract should be in 12.20 format */
+    xfract = (X & (q31_t)0x000FFFFF);
+
+    /* Read two nearest output values from the index */
+    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
+    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* yfract should be in 12.20 format */
+    yfract = (Y & (q31_t)0x000FFFFF);
+
+    /* Read two nearest output values from the index */
+    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
+    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 16.47 format */
+    out = ((x1 * (0xFFFFF - xfract)));
+    acc = (((q63_t) out * (0xFFFFF - yfract)));
+
+    /* x2 * (xfract) * (1-yfract)  in 2.22 and adding to acc */
+    out = ((x2 * (0xFFFFF - yfract)));
+    acc += (((q63_t) out * (xfract)));
+
+    /* y1 * (1 - xfract) * (yfract)  in 2.22 and adding to acc */
+    out = ((y1 * (0xFFFFF - xfract)));
+    acc += (((q63_t) out * (yfract)));
+
+    /* y2 * (xfract) * (yfract)  in 2.22 and adding to acc */
+    out = ((y2 * (yfract)));
+    acc += (((q63_t) out * (xfract)));
+
+    /* acc in 16.47 format and down shift by 40 to convert to 1.7 format */
+    return ((q7_t)(acc >> 40));
+  }
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
+
+/* SMMLAR */
+#define multAcc_32x32_keep32_R(a, x, y) \
+    a = (q31_t) (((((q63_t) a) << 32) + ((q63_t) x * y) + 0x80000000LL ) >> 32)
+
+/* SMMLSR */
+#define multSub_32x32_keep32_R(a, x, y) \
+    a = (q31_t) (((((q63_t) a) << 32) - ((q63_t) x * y) + 0x80000000LL ) >> 32)
+
+/* SMMULR */
+#define mult_32x32_keep32_R(a, x, y) \
+    a = (q31_t) (((q63_t) x * y + 0x80000000LL ) >> 32)
+
+/* SMMLA */
+#define multAcc_32x32_keep32(a, x, y) \
+    a += (q31_t) (((q63_t) x * y) >> 32)
+
+/* SMMLS */
+#define multSub_32x32_keep32(a, x, y) \
+    a -= (q31_t) (((q63_t) x * y) >> 32)
+
+/* SMMUL */
+#define mult_32x32_keep32(a, x, y) \
+    a = (q31_t) (((q63_t) x * y ) >> 32)
+
+
+#if   defined ( __CC_ARM )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("push")         \
+       _Pragma ("O1")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_EXIT \
+       _Pragma ("pop")
+  #else
+    #define LOW_OPTIMIZATION_EXIT
+  #endif
+
+  /* Enter low optimization region - place directly above function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __GNUC__ )
+  #define LOW_OPTIMIZATION_ENTER \
+       __attribute__(( optimize("-O1") ))
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __ICCARM__ )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define LOW_OPTIMIZATION_EXIT
+
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TI_ARM__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __CSMC__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TASKING__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+      #define LOW_OPTIMIZATION_ENTER
+      #define LOW_OPTIMIZATION_EXIT
+      #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+      #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+#endif
+
+
+
+/* Compiler specific diagnostic adjustment */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+#pragma GCC diagnostic pop
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _ARM_MATH_H */
+
+/**
+ *
+ * End of file.
+ */

+ 235 - 0
libraries/cmsis/cm4/core_support/arm_mve_tables.h

@@ -0,0 +1,235 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mve_tables.h
+ * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
+ *               used for MVE implementation only
+ *
+ * $Date:        08. January 2020
+ * $Revision:    V1.7.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #ifndef _ARM_MVE_TABLES_H
+ #define _ARM_MVE_TABLES_H
+
+ #include "arm_math.h"
+
+
+
+
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f32[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f32[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f32[2];
+extern float32_t rearranged_twiddle_stride1_16_f32[8];
+extern float32_t rearranged_twiddle_stride2_16_f32[8];
+extern float32_t rearranged_twiddle_stride3_16_f32[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f32[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f32[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f32[3];
+extern float32_t rearranged_twiddle_stride1_64_f32[40];
+extern float32_t rearranged_twiddle_stride2_64_f32[40];
+extern float32_t rearranged_twiddle_stride3_64_f32[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f32[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f32[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f32[4];
+extern float32_t rearranged_twiddle_stride1_256_f32[168];
+extern float32_t rearranged_twiddle_stride2_256_f32[168];
+extern float32_t rearranged_twiddle_stride3_256_f32[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f32[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f32[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f32[5];
+extern float32_t rearranged_twiddle_stride1_1024_f32[680];
+extern float32_t rearranged_twiddle_stride2_1024_f32[680];
+extern float32_t rearranged_twiddle_stride3_1024_f32[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) || defined(ARM_TABLE_TWIDDLECOEF_F32_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f32[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f32[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f32[6];
+extern float32_t rearranged_twiddle_stride1_4096_f32[2728];
+extern float32_t rearranged_twiddle_stride2_4096_f32[2728];
+extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+
+#if defined(ARM_MATH_MVEI)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q31[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q31[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q31[2];
+extern q31_t rearranged_twiddle_stride1_16_q31[8];
+extern q31_t rearranged_twiddle_stride2_16_q31[8];
+extern q31_t rearranged_twiddle_stride3_16_q31[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q31[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q31[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q31[3];
+extern q31_t rearranged_twiddle_stride1_64_q31[40];
+extern q31_t rearranged_twiddle_stride2_64_q31[40];
+extern q31_t rearranged_twiddle_stride3_64_q31[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q31[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q31[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q31[4];
+extern q31_t rearranged_twiddle_stride1_256_q31[168];
+extern q31_t rearranged_twiddle_stride2_256_q31[168];
+extern q31_t rearranged_twiddle_stride3_256_q31[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q31[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q31[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q31[5];
+extern q31_t rearranged_twiddle_stride1_1024_q31[680];
+extern q31_t rearranged_twiddle_stride2_1024_q31[680];
+extern q31_t rearranged_twiddle_stride3_1024_q31[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q31_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q31[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q31[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q31[6];
+extern q31_t rearranged_twiddle_stride1_4096_q31[2728];
+extern q31_t rearranged_twiddle_stride2_4096_q31[2728];
+extern q31_t rearranged_twiddle_stride3_4096_q31[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+
+#if defined(ARM_MATH_MVEI)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q15[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q15[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q15[2];
+extern q15_t rearranged_twiddle_stride1_16_q15[8];
+extern q15_t rearranged_twiddle_stride2_16_q15[8];
+extern q15_t rearranged_twiddle_stride3_16_q15[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_64) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q15[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q15[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q15[3];
+extern q15_t rearranged_twiddle_stride1_64_q15[40];
+extern q15_t rearranged_twiddle_stride2_64_q15[40];
+extern q15_t rearranged_twiddle_stride3_64_q15[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q15[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q15[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q15[4];
+extern q15_t rearranged_twiddle_stride1_256_q15[168];
+extern q15_t rearranged_twiddle_stride2_256_q15[168];
+extern q15_t rearranged_twiddle_stride3_256_q15[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q15[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q15[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q15[5];
+extern q15_t rearranged_twiddle_stride1_1024_q15[680];
+extern q15_t rearranged_twiddle_stride2_1024_q15[680];
+extern q15_t rearranged_twiddle_stride3_1024_q15[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q15_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q15[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q15[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q15[6];
+extern q15_t rearranged_twiddle_stride1_4096_q15[2728];
+extern q15_t rearranged_twiddle_stride2_4096_q15[2728];
+extern q15_t rearranged_twiddle_stride3_4096_q15[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+
+#if defined(ARM_MATH_MVEI)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+
+#endif /*_ARM_MVE_TABLES_H*/
+

+ 372 - 0
libraries/cmsis/cm4/core_support/arm_vec_math.h

@@ -0,0 +1,372 @@
+/******************************************************************************
+ * @file     arm_vec_math.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     15. October 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_VEC_MATH_H
+#define _ARM_VEC_MATH_H
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+#include "arm_helium_utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define INV_NEWTON_INIT_F32         0x7EF127EA
+
+static const float32_t __logf_rng_f32=0.693147180f;
+
+
+/* fast inverse approximation (3x newton) */
+__STATIC_INLINE f32x4_t vrecip_medprec_f32(
+    f32x4_t x)
+{
+    q31x4_t         m;
+    f32x4_t         b;
+    any32x4_t       xinv;
+    f32x4_t         ax = vabsq(x);
+
+    xinv.f = ax;
+    m = 0x3F800000 - (xinv.i & 0x7F800000);
+    xinv.i = xinv.i + m;
+    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
+    xinv.i = xinv.i + m;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
+    /*
+     * restore sign
+     */
+    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
+
+    return xinv.f;
+}
+
+/* fast inverse approximation (4x newton) */
+__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
+    f32x4_t x)
+{
+    q31x4_t         m;
+    f32x4_t         b;
+    any32x4_t       xinv;
+    f32x4_t         ax = vabsq(x);
+
+    xinv.f = ax;
+
+    m = 0x3F800000 - (xinv.i & 0x7F800000);
+    xinv.i = xinv.i + m;
+    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
+    xinv.i = xinv.i + m;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
+    /*
+     * restore sign
+     */
+    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
+
+    return xinv.f;
+}
+
+__STATIC_INLINE f32x4_t vdiv_f32(
+    f32x4_t num, f32x4_t den)
+{
+    return vmulq(num, vrecip_hiprec_f32(den));
+}
+
+/**
+  @brief         Single-precision taylor dev.
+  @param[in]     x              f32 quad vector input
+  @param[in]     coeffs         f32 quad vector coeffs
+  @return        destination    f32 quad vector
+ */
+
+__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
+        f32x4_t           x,
+        const float32_t * coeffs)
+{
+    f32x4_t         A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
+    f32x4_t         B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
+    f32x4_t         C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
+    f32x4_t         D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
+    f32x4_t         x2 = vmulq(x, x);
+    f32x4_t         x4 = vmulq(x2, x2);
+    f32x4_t         res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
+
+    return res;
+}
+
+__STATIC_INLINE f32x4_t vmant_exp_f32(
+    f32x4_t     x,
+    int32x4_t * e)
+{
+    any32x4_t       r;
+    int32x4_t       n;
+
+    r.f = x;
+    n = r.i >> 23;
+    n = n - 127;
+    r.i = r.i - (n << 23);
+
+    *e = n;
+    return r.f;
+}
+
+
+__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
+{
+    q31x4_t         vecExpUnBiased;
+    f32x4_t         vecTmpFlt0, vecTmpFlt1;
+    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
+    f32x4_t         vecExpUnBiasedFlt;
+
+    /*
+     * extract exponent
+     */
+    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);
+
+    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
+    /*
+     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
+     */
+    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
+    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
+    /*
+     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
+     */
+    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
+    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
+    /*
+     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
+     */
+    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
+    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
+    /*
+     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
+     */
+    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
+    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
+    /*
+     * a = a + b * xx;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
+    /*
+     * c = c + d * xx;
+     */
+    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
+    /*
+     * xx = xx * xx;
+     */
+    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
+    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
+    /*
+     * r.f = a + c * xx;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
+    /*
+     * add exponent
+     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
+    // set log0 down to -inf
+    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
+    return vecAcc0;
+}
+
+__STATIC_INLINE f32x4_t vexpq_f32(
+    f32x4_t x)
+{
+    // Perform range reduction [-log(2),log(2)]
+    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
+    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));
+
+    // Polynomial Approximation
+    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));
+
+    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
+    return poly;
+}
+
+__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
+{
+    f32x4_t         r = x;
+    nb--;
+    while (nb > 0) {
+        r = vmulq(r, x);
+        nb--;
+    }
+    return (r);
+}
+
+__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
+{
+    f32x4_t     vecSx, vecW, vecTmp;
+    any32x4_t   v;
+
+    vecSx = vabsq(vecIn);
+
+    v.f = vecIn;
+    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);
+
+    vecW = vmulq(vecSx, v.f);
+
+    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
+    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
+    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
+    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
+    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
+    v.f = vmulq(v.f,  vecTmp);
+
+    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
+    /*
+     * restore sign
+     */
+    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
+    return v.f;
+}
+
+__STATIC_INLINE f32x4_t vtanhq_f32(
+    f32x4_t val)
+{
+    f32x4_t         x =
+        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
+    f32x4_t         exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
+    f32x4_t         num = vsubq_n_f32(exp2x, 1.f);
+    f32x4_t         den = vaddq_n_f32(exp2x, 1.f);
+    f32x4_t         tanh = vmulq_f32(num, vrecip_f32(den));
+    return tanh;
+}
+
+__STATIC_INLINE f32x4_t vpowq_f32(
+    f32x4_t val,
+    f32x4_t n)
+{
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+/**
+ * @brief Vectorized integer exponentiation
+ * @param[in]    x           value
+ * @param[in]    nb          integer exponent >= 1
+ * @return x^nb
+ *
+ */
+__STATIC_INLINE  float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
+{
+    float32x4_t r = x;
+    nb --;
+    while(nb > 0)
+    {
+        r = vmulq_f32(r , x);
+        nb--;
+    }
+    return(r);
+}
+
+
+__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t  x)
+{
+    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
+    float32x4_t e = vrsqrteq_f32(x1);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
+    return vmulq_f32(x, e);
+}
+
+__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
+{
+    float32x4_t tempF;
+    int32x4_t tempHI,tempLO;
+
+    tempLO = vmovl_s16(vget_low_s16(vec));
+    tempF = vcvtq_n_f32_s32(tempLO,15);
+    tempF = __arm_vec_sqrt_f32_neon(tempF);
+    tempLO = vcvtq_n_s32_f32(tempF,15);
+
+    tempHI = vmovl_s16(vget_high_s16(vec));
+    tempF = vcvtq_n_f32_s32(tempHI,15);
+    tempF = __arm_vec_sqrt_f32_neon(tempF);
+    tempHI = vcvtq_n_s32_f32(tempF,15);
+
+    return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
+}
+
+__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
+{
+  float32x4_t temp;
+
+  temp = vcvtq_n_f32_s32(vec,31);
+  temp = __arm_vec_sqrt_f32_neon(temp);
+  return(vcvtq_n_s32_f32(temp,31));
+}
+
+#endif /*  (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _ARM_VEC_MATH_H */
+
+/**
+ *
+ * End of file.
+ */

+ 885 - 0
libraries/cmsis/cm4/core_support/cmsis_armcc.h

@@ -0,0 +1,885 @@
+/******************************************************************************
+ * @file     cmsis_armcc.h
+ * @brief    CMSIS compiler ARMCC (Arm Compiler 5) header file
+ * @version  V5.2.1
+ * @date     26. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_ARMCC_H
+#define __CMSIS_ARMCC_H
+
+
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 400677)
+  #error "Please use Arm Compiler Toolchain V4.0.677 or later!"
+#endif
+
+/* CMSIS compiler control architecture macros */
+#if ((defined (__TARGET_ARCH_6_M  ) && (__TARGET_ARCH_6_M   == 1)) || \
+     (defined (__TARGET_ARCH_6S_M ) && (__TARGET_ARCH_6S_M  == 1))   )
+  #define __ARM_ARCH_6M__           1
+#endif
+
+#if (defined (__TARGET_ARCH_7_M ) && (__TARGET_ARCH_7_M  == 1))
+  #define __ARM_ARCH_7M__           1
+#endif
+
+#if (defined (__TARGET_ARCH_7E_M) && (__TARGET_ARCH_7E_M == 1))
+  #define __ARM_ARCH_7EM__          1
+#endif
+
+  /* __ARM_ARCH_8M_BASE__  not applicable */
+  /* __ARM_ARCH_8M_MAIN__  not applicable */
+  /* __ARM_ARCH_8_1M_MAIN__  not applicable */
+
+/* CMSIS compiler control DSP macros */
+#if ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     )
+  #define __ARM_FEATURE_DSP         1
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE
+  #define __INLINE                               __inline
+#endif
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static __inline
+#endif
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE                   static __forceinline
+#endif
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __declspec(noreturn)
+#endif
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed))
+#endif
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        __packed struct
+#endif
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         __packed union
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated */
+  #define __UNALIGNED_UINT32(x)                  (*((__packed uint32_t *)(x)))
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    ((*((__packed uint16_t *)(addr))) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ
+  #define __UNALIGNED_UINT16_READ(addr)          (*((const __packed uint16_t *)(addr)))
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    ((*((__packed uint32_t *)(addr))) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ
+  #define __UNALIGNED_UINT32_READ(addr)          (*((const __packed uint32_t *)(addr)))
+#endif
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __memory_changed()
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START           __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section("RESET")))
+#endif
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __enable_irq();     */
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __disable_irq();    */
+
+/**
+  \brief   Get Control Register
+  \details Returns the content of the Control Register.
+  \return               Control Register value
+ */
+__STATIC_INLINE uint32_t __get_CONTROL(void)
+{
+  register uint32_t __regControl         __ASM("control");
+  return(__regControl);
+}
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_INLINE void __set_CONTROL(uint32_t control)
+{
+  register uint32_t __regControl         __ASM("control");
+  __regControl = control;
+}
+
+
+/**
+  \brief   Get IPSR Register
+  \details Returns the content of the IPSR Register.
+  \return               IPSR Register value
+ */
+__STATIC_INLINE uint32_t __get_IPSR(void)
+{
+  register uint32_t __regIPSR          __ASM("ipsr");
+  return(__regIPSR);
+}
+
+
+/**
+  \brief   Get APSR Register
+  \details Returns the content of the APSR Register.
+  \return               APSR Register value
+ */
+__STATIC_INLINE uint32_t __get_APSR(void)
+{
+  register uint32_t __regAPSR          __ASM("apsr");
+  return(__regAPSR);
+}
+
+
+/**
+  \brief   Get xPSR Register
+  \details Returns the content of the xPSR Register.
+  \return               xPSR Register value
+ */
+__STATIC_INLINE uint32_t __get_xPSR(void)
+{
+  register uint32_t __regXPSR          __ASM("xpsr");
+  return(__regXPSR);
+}
+
+
+/**
+  \brief   Get Process Stack Pointer
+  \details Returns the current value of the Process Stack Pointer (PSP).
+  \return               PSP Register value
+ */
+__STATIC_INLINE uint32_t __get_PSP(void)
+{
+  register uint32_t __regProcessStackPointer  __ASM("psp");
+  return(__regProcessStackPointer);
+}
+
+
+/**
+  \brief   Set Process Stack Pointer
+  \details Assigns the given value to the Process Stack Pointer (PSP).
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_INLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  register uint32_t __regProcessStackPointer  __ASM("psp");
+  __regProcessStackPointer = topOfProcStack;
+}
+
+
+/**
+  \brief   Get Main Stack Pointer
+  \details Returns the current value of the Main Stack Pointer (MSP).
+  \return               MSP Register value
+ */
+__STATIC_INLINE uint32_t __get_MSP(void)
+{
+  register uint32_t __regMainStackPointer     __ASM("msp");
+  return(__regMainStackPointer);
+}
+
+
+/**
+  \brief   Set Main Stack Pointer
+  \details Assigns the given value to the Main Stack Pointer (MSP).
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_INLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  register uint32_t __regMainStackPointer     __ASM("msp");
+  __regMainStackPointer = topOfMainStack;
+}
+
+
+/**
+  \brief   Get Priority Mask
+  \details Returns the current state of the priority mask bit from the Priority Mask Register.
+  \return               Priority Mask value
+ */
+__STATIC_INLINE uint32_t __get_PRIMASK(void)
+{
+  register uint32_t __regPriMask         __ASM("primask");
+  return(__regPriMask);
+}
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_INLINE void __set_PRIMASK(uint32_t priMask)
+{
+  register uint32_t __regPriMask         __ASM("primask");
+  __regPriMask = (priMask);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__  == 1)) || \
+     (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     )
+
+/**
+  \brief   Enable FIQ
+  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq                __enable_fiq
+
+
+/**
+  \brief   Disable FIQ
+  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq               __disable_fiq
+
+
+/**
+  \brief   Get Base Priority
+  \details Returns the current value of the Base Priority register.
+  \return               Base Priority register value
+ */
+__STATIC_INLINE uint32_t  __get_BASEPRI(void)
+{
+  register uint32_t __regBasePri         __ASM("basepri");
+  return(__regBasePri);
+}
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_INLINE void __set_BASEPRI(uint32_t basePri)
+{
+  register uint32_t __regBasePri         __ASM("basepri");
+  __regBasePri = (basePri & 0xFFU);
+}
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+           or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_INLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  register uint32_t __regBasePriMax      __ASM("basepri_max");
+  __regBasePriMax = (basePri & 0xFFU);
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register.
+  \return               Fault Mask register value
+ */
+__STATIC_INLINE uint32_t __get_FAULTMASK(void)
+{
+  register uint32_t __regFaultMask       __ASM("faultmask");
+  return(__regFaultMask);
+}
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_INLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  register uint32_t __regFaultMask       __ASM("faultmask");
+  __regFaultMask = (faultMask & (uint32_t)1U);
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__  == 1)) || \
+           (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     ) */
+
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register.
+  \return               Floating Point Status/Control register value
+ */
+__STATIC_INLINE uint32_t __get_FPSCR(void)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+  register uint32_t __regfpscr         __ASM("fpscr");
+  return(__regfpscr);
+#else
+   return(0U);
+#endif
+}
+
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+__STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+  register uint32_t __regfpscr         __ASM("fpscr");
+  __regfpscr = (fpscr);
+#else
+  (void)fpscr;
+#endif
+}
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/**
+  \brief   No Operation
+  \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP                             __nop
+
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI                             __wfi
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter
+           a low-power state until one of a number of events occurs.
+ */
+#define __WFE                             __wfe
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV                             __sev
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed.
+ */
+#define __ISB()                           __isb(0xF)
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB()                           __dsb(0xF)
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before
+           and after the instruction, without ensuring their completion.
+ */
+#define __DMB()                           __dmb(0xF)
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV                             __rev
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".rev16_text"))) __STATIC_INLINE __ASM uint32_t __REV16(uint32_t value)
+{
+  rev16 r0, r0
+  bx lr
+}
+#endif
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".revsh_text"))) __STATIC_INLINE __ASM int16_t __REVSH(int16_t value)
+{
+  revsh r0, r0
+  bx lr
+}
+#endif
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+#define __ROR                             __ror
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)                       __breakpoint(value)
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__  == 1)) || \
+     (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     )
+  #define __RBIT                          __rbit
+#else
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __RBIT(uint32_t value)
+{
+  uint32_t result;
+  uint32_t s = (4U /*sizeof(v)*/ * 8U) - 1U; /* extra shift needed at end */
+
+  result = value;                      /* r will be reversed bits of v; first get LSB of v */
+  for (value >>= 1U; value != 0U; value >>= 1U)
+  {
+    result <<= 1U;
+    result |= value & 1U;
+    s--;
+  }
+  result <<= s;                        /* shift when v's highest bits are zero */
+  return result;
+}
+#endif
+
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value
+ */
+#define __CLZ                             __clz
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__  == 1)) || \
+     (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     )
+
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes a exclusive LDR instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+  #define __LDREXB(ptr)                                                        ((uint8_t ) __ldrex(ptr))
+#else
+  #define __LDREXB(ptr)          _Pragma("push") _Pragma("diag_suppress 3731") ((uint8_t ) __ldrex(ptr))  _Pragma("pop")
+#endif
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes a exclusive LDR instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+  #define __LDREXH(ptr)                                                        ((uint16_t) __ldrex(ptr))
+#else
+  #define __LDREXH(ptr)          _Pragma("push") _Pragma("diag_suppress 3731") ((uint16_t) __ldrex(ptr))  _Pragma("pop")
+#endif
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes a exclusive LDR instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+  #define __LDREXW(ptr)                                                        ((uint32_t ) __ldrex(ptr))
+#else
+  #define __LDREXW(ptr)          _Pragma("push") _Pragma("diag_suppress 3731") ((uint32_t ) __ldrex(ptr))  _Pragma("pop")
+#endif
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes a exclusive STR instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+  #define __STREXB(value, ptr)                                                 __strex(value, ptr)
+#else
+  #define __STREXB(value, ptr)   _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr)        _Pragma("pop")
+#endif
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes a exclusive STR instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+  #define __STREXH(value, ptr)                                                 __strex(value, ptr)
+#else
+  #define __STREXH(value, ptr)   _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr)        _Pragma("pop")
+#endif
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes a exclusive STR instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+  #define __STREXW(value, ptr)                                                 __strex(value, ptr)
+#else
+  #define __STREXW(value, ptr)   _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr)        _Pragma("pop")
+#endif
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX.
+ */
+#define __CLREX                           __clrex
+
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+#define __SSAT                            __ssat
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+#define __USAT                            __usat
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".rrx_text"))) __STATIC_INLINE __ASM uint32_t __RRX(uint32_t value)
+{
+  rrx r0, r0
+  bx lr
+}
+#endif
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes a Unprivileged LDRT instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define __LDRBT(ptr)                      ((uint8_t )  __ldrt(ptr))
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes a Unprivileged LDRT instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define __LDRHT(ptr)                      ((uint16_t)  __ldrt(ptr))
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes a Unprivileged LDRT instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define __LDRT(ptr)                       ((uint32_t ) __ldrt(ptr))
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes a Unprivileged STRT instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+#define __STRBT(value, ptr)               __strt(value, ptr)
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes a Unprivileged STRT instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+#define __STRHT(value, ptr)               __strt(value, ptr)
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes a Unprivileged STRT instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+#define __STRT(value, ptr)                __strt(value, ptr)
+
+#else  /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__  == 1)) || \
+           (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     ) */
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+__attribute__((always_inline)) __STATIC_INLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  if ((sat >= 1U) && (sat <= 32U))
+  {
+    const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+    const int32_t min = -1 - max ;
+    if (val > max)
+    {
+      return max;
+    }
+    else if (val < min)
+    {
+      return min;
+    }
+  }
+  return val;
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  if (sat <= 31U)
+  {
+    const uint32_t max = ((1U << sat) - 1U);
+    if (val > (int32_t)max)
+    {
+      return max;
+    }
+    else if (val < 0)
+    {
+      return 0U;
+    }
+  }
+  return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__  == 1)) || \
+           (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     )
+
+#define __SADD8                           __sadd8
+#define __QADD8                           __qadd8
+#define __SHADD8                          __shadd8
+#define __UADD8                           __uadd8
+#define __UQADD8                          __uqadd8
+#define __UHADD8                          __uhadd8
+#define __SSUB8                           __ssub8
+#define __QSUB8                           __qsub8
+#define __SHSUB8                          __shsub8
+#define __USUB8                           __usub8
+#define __UQSUB8                          __uqsub8
+#define __UHSUB8                          __uhsub8
+#define __SADD16                          __sadd16
+#define __QADD16                          __qadd16
+#define __SHADD16                         __shadd16
+#define __UADD16                          __uadd16
+#define __UQADD16                         __uqadd16
+#define __UHADD16                         __uhadd16
+#define __SSUB16                          __ssub16
+#define __QSUB16                          __qsub16
+#define __SHSUB16                         __shsub16
+#define __USUB16                          __usub16
+#define __UQSUB16                         __uqsub16
+#define __UHSUB16                         __uhsub16
+#define __SASX                            __sasx
+#define __QASX                            __qasx
+#define __SHASX                           __shasx
+#define __UASX                            __uasx
+#define __UQASX                           __uqasx
+#define __UHASX                           __uhasx
+#define __SSAX                            __ssax
+#define __QSAX                            __qsax
+#define __SHSAX                           __shsax
+#define __USAX                            __usax
+#define __UQSAX                           __uqsax
+#define __UHSAX                           __uhsax
+#define __USAD8                           __usad8
+#define __USADA8                          __usada8
+#define __SSAT16                          __ssat16
+#define __USAT16                          __usat16
+#define __UXTB16                          __uxtb16
+#define __UXTAB16                         __uxtab16
+#define __SXTB16                          __sxtb16
+#define __SXTAB16                         __sxtab16
+#define __SMUAD                           __smuad
+#define __SMUADX                          __smuadx
+#define __SMLAD                           __smlad
+#define __SMLADX                          __smladx
+#define __SMLALD                          __smlald
+#define __SMLALDX                         __smlaldx
+#define __SMUSD                           __smusd
+#define __SMUSDX                          __smusdx
+#define __SMLSD                           __smlsd
+#define __SMLSDX                          __smlsdx
+#define __SMLSLD                          __smlsld
+#define __SMLSLDX                         __smlsldx
+#define __SEL                             __sel
+#define __QADD                            __qadd
+#define __QSUB                            __qsub
+
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+#define __SMMLA(ARG1,ARG2,ARG3)          ( (int32_t)((((int64_t)(ARG1) * (ARG2)) + \
+                                                      ((int64_t)(ARG3) << 32U)     ) >> 32U))
+
+#define __SXTB16_RORn(ARG1, ARG2)        __SXTB16(__ROR(ARG1, ARG2))
+
+#endif /* ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1))     ) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#endif /* __CMSIS_ARMCC_H */

+ 1467 - 0
libraries/cmsis/cm4/core_support/cmsis_armclang.h

@@ -0,0 +1,1467 @@
+/******************************************************************************
+ * @file     cmsis_armclang.h
+ * @brief    CMSIS compiler armclang (Arm Compiler 6) header file
+ * @version  V5.3.1
+ * @date     26. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*lint -esym(9058, IRQn)*/ /* disable MISRA 2012 Rule 2.4 for IRQn */
+
+#ifndef __CMSIS_ARMCLANG_H
+#define __CMSIS_ARMCLANG_H
+
+#pragma clang system_header   /* treat file as system include file */
+
+#ifndef __ARM_COMPAT_H
+#include <arm_compat.h>    /* Compatibility header for Arm Compiler 5 intrinsics */
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE
+  #define __INLINE                               __inline
+#endif
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static __inline
+#endif
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static __inline
+#endif
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32 */
+  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
+  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
+  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
+  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */
+  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START           __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section("RESET")))
+#endif
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __enable_irq();  see arm_compat.h */
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __disable_irq();  see arm_compat.h */
+
+
+/**
+  \brief   Get Control Register
+  \details Returns the content of the Control Register.
+  \return               Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Control Register (non-secure)
+  \details Returns the content of the non-secure Control Register when in secure mode.
+  \return               non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Control Register (non-secure)
+  \details Writes the given value to the non-secure Control Register when in secure state.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+  __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Get IPSR Register
+  \details Returns the content of the IPSR Register.
+  \return               IPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get APSR Register
+  \details Returns the content of the APSR Register.
+  \return               APSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get xPSR Register
+  \details Returns the content of the xPSR Register.
+  \return               xPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get Process Stack Pointer
+  \details Returns the current value of the Process Stack Pointer (PSP).
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp"  : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state.
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp_ns"  : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer
+  \details Assigns the given value to the Process Stack Pointer (PSP).
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state.
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer
+  \details Returns the current value of the Main Stack Pointer (MSP).
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Main Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state.
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer
+  \details Assigns the given value to the Main Stack Pointer (MSP).
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Main Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state.
+  \return               SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, sp_ns" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Set Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state.
+  \param [in]    topOfStack  Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+  __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Priority Mask
+  \details Returns the current state of the priority mask bit from the Priority Mask Register.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Priority Mask (non-secure)
+  \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Priority Mask (non-secure)
+  \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+     (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+/**
+  \brief   Enable FIQ
+  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq                __enable_fiq   /* see arm_compat.h */
+
+
+/**
+  \brief   Disable FIQ
+  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq               __disable_fiq   /* see arm_compat.h */
+
+
+/**
+  \brief   Get Base Priority
+  \details Returns the current value of the Base Priority register.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Base Priority (non-secure)
+  \details Returns the current value of the non-secure Base Priority register when in secure state.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Base Priority (non-secure)
+  \details Assigns the given value to the non-secure Base Priority register when in secure state.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+           or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Fault Mask (non-secure)
+  \details Returns the current value of the non-secure Fault Mask register when in secure state.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Fault Mask (non-secure)
+  \details Assigns the given value to the non-secure Fault Mask register when in secure state.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+           (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   Get Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim"  : "=r" (result) );
+  return result;
+#endif
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim_ns"  : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));
+#endif
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+  return result;
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Get Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state.
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register.
+  \return               Floating Point Status/Control register value
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#define __get_FPSCR      (uint32_t)__builtin_arm_get_fpscr
+#else
+#define __get_FPSCR()      ((uint32_t)0U)
+#endif
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#define __set_FPSCR      __builtin_arm_set_fpscr
+#else
+#define __set_FPSCR(x)      ((void)(x))
+#endif
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_RW_REG(r) "+l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_RW_REG(r) "+r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/**
+  \brief   No Operation
+  \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP          __builtin_arm_nop
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI          __builtin_arm_wfi
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter
+           a low-power state until one of a number of events occurs.
+ */
+#define __WFE          __builtin_arm_wfe
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV          __builtin_arm_sev
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed.
+ */
+#define __ISB()        __builtin_arm_isb(0xF)
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB()        __builtin_arm_dsb(0xF)
+
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before
+           and after the instruction, without ensuring their completion.
+ */
+#define __DMB()        __builtin_arm_dmb(0xF)
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV(value)   __builtin_bswap32(value)
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV16(value) __ROR(__REV(value), 16)
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REVSH(value) (int16_t)__builtin_bswap16(value)
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  op2 %= 32U;
+  if (op2 == 0U)
+  {
+    return op1;
+  }
+  return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)     __ASM volatile ("bkpt "#value)
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __RBIT            __builtin_arm_rbit
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+  /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+     __builtin_clz(0) is undefined behaviour, so handle this case specially.
+     This guarantees ARM-compatible results if happening to compile on a non-ARM
+     target, and ensures the compiler doesn't decide to activate any
+     optimisations using the logic "value was passed to __builtin_clz, so it
+     is non-zero".
+     ARM Compiler 6.10 and possibly earlier will optimise this test away, leaving a
+     single CLZ instruction.
+   */
+  if (value == 0U)
+  {
+    return 32U;
+  }
+  return __builtin_clz(value);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+     (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes a exclusive LDR instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define __LDREXB        (uint8_t)__builtin_arm_ldrex
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes a exclusive LDR instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define __LDREXH        (uint16_t)__builtin_arm_ldrex
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes a exclusive LDR instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define __LDREXW        (uint32_t)__builtin_arm_ldrex
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes a exclusive STR instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXB        (uint32_t)__builtin_arm_strex
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes a exclusive STR instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXH        (uint32_t)__builtin_arm_strex
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes a exclusive STR instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXW        (uint32_t)__builtin_arm_strex
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX.
+ */
+#define __CLREX             __builtin_arm_clrex
+
+#endif /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+           (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+
+#if ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+     (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+#define __SSAT             __builtin_arm_ssat
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+#define __USAT             __builtin_arm_usat
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return(result);
+}
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes a Unprivileged LDRT instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) );
+  return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes a Unprivileged LDRT instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) );
+  return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes a Unprivileged LDRT instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) );
+  return(result);
+}
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes a Unprivileged STRT instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+  __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes a Unprivileged STRT instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+  __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes a Unprivileged STRT instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+  __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );
+}
+
+#else /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+          (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+          (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+          (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  if ((sat >= 1U) && (sat <= 32U))
+  {
+    const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+    const int32_t min = -1 - max ;
+    if (val > max)
+    {
+      return max;
+    }
+    else if (val < min)
+    {
+      return min;
+    }
+  }
+  return val;
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  if (sat <= 31U)
+  {
+    const uint32_t max = ((1U << sat) - 1U);
+    if (val > (int32_t)max)
+    {
+      return max;
+    }
+    else if (val < 0)
+    {
+      return 0U;
+    }
+  }
+  return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+           (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   Load-Acquire (8 bit)
+  \details Executes a LDAB instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+  return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (16 bit)
+  \details Executes a LDAH instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+  return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (32 bit)
+  \details Executes a LDA instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+  return(result);
+}
+
+
+/**
+  \brief   Store-Release (8 bit)
+  \details Executes a STLB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+  __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Store-Release (16 bit)
+  \details Executes a STLH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+  __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Store-Release (32 bit)
+  \details Executes a STL instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+  __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (8 bit)
+  \details Executes a LDAB exclusive instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define     __LDAEXB                 (uint8_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Load-Acquire Exclusive (16 bit)
+  \details Executes a LDAH exclusive instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define     __LDAEXH                 (uint16_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Load-Acquire Exclusive (32 bit)
+  \details Executes a LDA exclusive instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define     __LDAEX                  (uint32_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Store-Release Exclusive (8 bit)
+  \details Executes a STLB exclusive instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEXB                 (uint32_t)__builtin_arm_stlex
+
+
+/**
+  \brief   Store-Release Exclusive (16 bit)
+  \details Executes a STLH exclusive instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEXH                 (uint32_t)__builtin_arm_stlex
+
+
+/**
+  \brief   Store-Release Exclusive (32 bit)
+  \details Executes a STL exclusive instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEX                  (uint32_t)__builtin_arm_stlex
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+
+#define     __SADD8                 __builtin_arm_sadd8
+#define     __QADD8                 __builtin_arm_qadd8
+#define     __SHADD8                __builtin_arm_shadd8
+#define     __UADD8                 __builtin_arm_uadd8
+#define     __UQADD8                __builtin_arm_uqadd8
+#define     __UHADD8                __builtin_arm_uhadd8
+#define     __SSUB8                 __builtin_arm_ssub8
+#define     __QSUB8                 __builtin_arm_qsub8
+#define     __SHSUB8                __builtin_arm_shsub8
+#define     __USUB8                 __builtin_arm_usub8
+#define     __UQSUB8                __builtin_arm_uqsub8
+#define     __UHSUB8                __builtin_arm_uhsub8
+#define     __SADD16                __builtin_arm_sadd16
+#define     __QADD16                __builtin_arm_qadd16
+#define     __SHADD16               __builtin_arm_shadd16
+#define     __UADD16                __builtin_arm_uadd16
+#define     __UQADD16               __builtin_arm_uqadd16
+#define     __UHADD16               __builtin_arm_uhadd16
+#define     __SSUB16                __builtin_arm_ssub16
+#define     __QSUB16                __builtin_arm_qsub16
+#define     __SHSUB16               __builtin_arm_shsub16
+#define     __USUB16                __builtin_arm_usub16
+#define     __UQSUB16               __builtin_arm_uqsub16
+#define     __UHSUB16               __builtin_arm_uhsub16
+#define     __SASX                  __builtin_arm_sasx
+#define     __QASX                  __builtin_arm_qasx
+#define     __SHASX                 __builtin_arm_shasx
+#define     __UASX                  __builtin_arm_uasx
+#define     __UQASX                 __builtin_arm_uqasx
+#define     __UHASX                 __builtin_arm_uhasx
+#define     __SSAX                  __builtin_arm_ssax
+#define     __QSAX                  __builtin_arm_qsax
+#define     __SHSAX                 __builtin_arm_shsax
+#define     __USAX                  __builtin_arm_usax
+#define     __UQSAX                 __builtin_arm_uqsax
+#define     __UHSAX                 __builtin_arm_uhsax
+#define     __USAD8                 __builtin_arm_usad8
+#define     __USADA8                __builtin_arm_usada8
+#define     __SSAT16                __builtin_arm_ssat16
+#define     __USAT16                __builtin_arm_usat16
+#define     __UXTB16                __builtin_arm_uxtb16
+#define     __UXTAB16               __builtin_arm_uxtab16
+#define     __SXTB16                __builtin_arm_sxtb16
+#define     __SXTAB16               __builtin_arm_sxtab16
+#define     __SMUAD                 __builtin_arm_smuad
+#define     __SMUADX                __builtin_arm_smuadx
+#define     __SMLAD                 __builtin_arm_smlad
+#define     __SMLADX                __builtin_arm_smladx
+#define     __SMLALD                __builtin_arm_smlald
+#define     __SMLALDX               __builtin_arm_smlaldx
+#define     __SMUSD                 __builtin_arm_smusd
+#define     __SMUSDX                __builtin_arm_smusdx
+#define     __SMLSD                 __builtin_arm_smlsd
+#define     __SMLSDX                __builtin_arm_smlsdx
+#define     __SMLSLD                __builtin_arm_smlsld
+#define     __SMLSLDX               __builtin_arm_smlsldx
+#define     __SEL                   __builtin_arm_sel
+#define     __QADD                  __builtin_arm_qadd
+#define     __QSUB                  __builtin_arm_qsub
+
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+#define __SXTB16_RORn(ARG1, ARG2)        __SXTB16(__ROR(ARG1, ARG2))
+
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+  int32_t result;
+
+  __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r"  (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#endif /* __CMSIS_ARMCLANG_H */

+ 1893 - 0
libraries/cmsis/cm4/core_support/cmsis_armclang_ltm.h

@@ -0,0 +1,1893 @@
+/******************************************************************************
+ * @file     cmsis_armclang_ltm.h
+ * @brief    CMSIS compiler armclang (Arm Compiler 6) header file
+ * @version  V1.3.0
+ * @date     26. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2018-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*lint -esym(9058, IRQn)*/ /* disable MISRA 2012 Rule 2.4 for IRQn */
+
+#ifndef __CMSIS_ARMCLANG_H
+#define __CMSIS_ARMCLANG_H
+
+#pragma clang system_header   /* treat file as system include file */
+
+#ifndef __ARM_COMPAT_H
+#include <arm_compat.h>    /* Compatibility header for Arm Compiler 5 intrinsics */
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE
+  #define __INLINE                               __inline
+#endif
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static __inline
+#endif
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static __inline
+#endif
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32 */
+  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
+  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
+  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
+  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */
+  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START           __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section("RESET")))
+#endif
+
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __enable_irq();  see arm_compat.h */
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __disable_irq();  see arm_compat.h */
+
+
+/**
+  \brief   Get Control Register
+  \details Returns the content of the Control Register.
+  \return               Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Control Register (non-secure)
+  \details Returns the content of the non-secure Control Register when in secure mode.
+  \return               non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Control Register (non-secure)
+  \details Writes the given value to the non-secure Control Register when in secure state.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+  __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Get IPSR Register
+  \details Returns the content of the IPSR Register.
+  \return               IPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get APSR Register
+  \details Returns the content of the APSR Register.
+  \return               APSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get xPSR Register
+  \details Returns the content of the xPSR Register.
+  \return               xPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get Process Stack Pointer
+  \details Returns the current value of the Process Stack Pointer (PSP).
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp"  : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state.
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp_ns"  : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer
+  \details Assigns the given value to the Process Stack Pointer (PSP).
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state.
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer
+  \details Returns the current value of the Main Stack Pointer (MSP).
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Main Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state.
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer
+  \details Assigns the given value to the Main Stack Pointer (MSP).
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Main Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state.
+  \return               SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, sp_ns" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Set Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state.
+  \param [in]    topOfStack  Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+  __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Priority Mask
+  \details Returns the current state of the priority mask bit from the Priority Mask Register.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Priority Mask (non-secure)
+  \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Priority Mask (non-secure)
+  \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Enable FIQ
+  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq                __enable_fiq   /* see arm_compat.h */
+
+
+/**
+  \brief   Disable FIQ
+  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq               __disable_fiq   /* see arm_compat.h */
+
+
+/**
+  \brief   Get Base Priority
+  \details Returns the current value of the Base Priority register.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Base Priority (non-secure)
+  \details Returns the current value of the non-secure Base Priority register when in secure state.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Base Priority (non-secure)
+  \details Assigns the given value to the non-secure Base Priority register when in secure state.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+           or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Fault Mask (non-secure)
+  \details Returns the current value of the non-secure Fault Mask register when in secure state.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Fault Mask (non-secure)
+  \details Assigns the given value to the non-secure Fault Mask register when in secure state.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+
+/**
+  \brief   Get Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim"  : "=r" (result) );
+  return result;
+#endif
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim_ns"  : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));
+#endif
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+  return result;
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Get Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state.
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register.
+  \return               Floating Point Status/Control register value
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#define __get_FPSCR      (uint32_t)__builtin_arm_get_fpscr
+#else
+#define __get_FPSCR()      ((uint32_t)0U)
+#endif
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#define __set_FPSCR      __builtin_arm_set_fpscr
+#else
+#define __set_FPSCR(x)      ((void)(x))
+#endif
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/**
+  \brief   No Operation
+  \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP          __builtin_arm_nop
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI          __builtin_arm_wfi
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter
+           a low-power state until one of a number of events occurs.
+ */
+#define __WFE          __builtin_arm_wfe
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV          __builtin_arm_sev
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed.
+ */
+#define __ISB()        __builtin_arm_isb(0xF)
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB()        __builtin_arm_dsb(0xF)
+
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before
+           and after the instruction, without ensuring their completion.
+ */
+#define __DMB()        __builtin_arm_dmb(0xF)
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV(value)   __builtin_bswap32(value)
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV16(value) __ROR(__REV(value), 16)
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REVSH(value) (int16_t)__builtin_bswap16(value)
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  op2 %= 32U;
+  if (op2 == 0U)
+  {
+    return op1;
+  }
+  return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)     __ASM volatile ("bkpt "#value)
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __RBIT            __builtin_arm_rbit
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+  /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+     __builtin_clz(0) is undefined behaviour, so handle this case specially.
+     This guarantees ARM-compatible results if happening to compile on a non-ARM
+     target, and ensures the compiler doesn't decide to activate any
+     optimisations using the logic "value was passed to __builtin_clz, so it
+     is non-zero".
+     ARM Compiler 6.10 and possibly earlier will optimise this test away, leaving a
+     single CLZ instruction.
+   */
+  if (value == 0U)
+  {
+    return 32U;
+  }
+  return __builtin_clz(value);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes a exclusive LDR instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define __LDREXB        (uint8_t)__builtin_arm_ldrex
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes a exclusive LDR instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define __LDREXH        (uint16_t)__builtin_arm_ldrex
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes a exclusive LDR instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define __LDREXW        (uint32_t)__builtin_arm_ldrex
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes a exclusive STR instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXB        (uint32_t)__builtin_arm_strex
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes a exclusive STR instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXH        (uint32_t)__builtin_arm_strex
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes a exclusive STR instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXW        (uint32_t)__builtin_arm_strex
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX.
+ */
+#define __CLREX             __builtin_arm_clrex
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+#define __SSAT             __builtin_arm_ssat
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+#define __USAT             __builtin_arm_usat
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return(result);
+}
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes a Unprivileged LDRT instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) );
+  return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes a Unprivileged LDRT instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) );
+  return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes a Unprivileged LDRT instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) );
+  return(result);
+}
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes a Unprivileged STRT instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+  __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes a Unprivileged STRT instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+  __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes a Unprivileged STRT instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+  __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );
+}
+
+#else  /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  if ((sat >= 1U) && (sat <= 32U))
+  {
+    const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+    const int32_t min = -1 - max ;
+    if (val > max)
+    {
+      return max;
+    }
+    else if (val < min)
+    {
+      return min;
+    }
+  }
+  return val;
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  if (sat <= 31U)
+  {
+    const uint32_t max = ((1U << sat) - 1U);
+    if (val > (int32_t)max)
+    {
+      return max;
+    }
+    else if (val < 0)
+    {
+      return 0U;
+    }
+  }
+  return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   Load-Acquire (8 bit)
+  \details Executes a LDAB instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+  return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (16 bit)
+  \details Executes a LDAH instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+  return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (32 bit)
+  \details Executes a LDA instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+  uint32_t result;
+
+  __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+  return(result);
+}
+
+
+/**
+  \brief   Store-Release (8 bit)
+  \details Executes a STLB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+  __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Store-Release (16 bit)
+  \details Executes a STLH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+  __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Store-Release (32 bit)
+  \details Executes a STL instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+  __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (8 bit)
+  \details Executes a LDAB exclusive instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define     __LDAEXB                 (uint8_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Load-Acquire Exclusive (16 bit)
+  \details Executes a LDAH exclusive instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define     __LDAEXH                 (uint16_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Load-Acquire Exclusive (32 bit)
+  \details Executes a LDA exclusive instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define     __LDAEX                  (uint32_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Store-Release Exclusive (8 bit)
+  \details Executes a STLB exclusive instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEXB                 (uint32_t)__builtin_arm_stlex
+
+
+/**
+  \brief   Store-Release Exclusive (16 bit)
+  \details Executes a STLH exclusive instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEXH                 (uint32_t)__builtin_arm_stlex
+
+
+/**
+  \brief   Store-Release Exclusive (32 bit)
+  \details Executes a STL exclusive instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEX                  (uint32_t)__builtin_arm_stlex
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+
+__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#define __SSAT16(ARG1,ARG2) \
+({                          \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM ("ssat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) ); \
+  __RES; \
+ })
+
+#define __USAT16(ARG1,ARG2) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM ("usat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) ); \
+  __RES; \
+ })
+
+__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1)
+{
+  uint32_t result;
+
+  __ASM volatile ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
+{
+  uint32_t result;
+
+  __ASM volatile ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUAD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SEL  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QADD( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+
+  __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QSUB( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+
+  __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+#define __SXTB16_RORn(ARG1, ARG2)        __SXTB16(__ROR(ARG1, ARG2))
+
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+  int32_t result;
+
+  __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r"  (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#endif /* __CMSIS_ARMCLANG_H */

+ 283 - 0
libraries/cmsis/cm4/core_support/cmsis_compiler.h

@@ -0,0 +1,283 @@
+/******************************************************************************
+ * @file     cmsis_compiler.h
+ * @brief    CMSIS compiler generic header file
+ * @version  V5.1.0
+ * @date     09. October 2018
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_COMPILER_H
+#define __CMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*
+ * Arm Compiler 4/5
+ */
+#if   defined ( __CC_ARM )
+  #include "cmsis_armcc.h"
+
+
+/*
+ * Arm Compiler 6.6 LTM (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
+  #include "cmsis_armclang_ltm.h"
+
+  /*
+ * Arm Compiler above 6.10.1 (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
+  #include "cmsis_armclang.h"
+
+
+/*
+ * GNU Compiler
+ */
+#elif defined ( __GNUC__ )
+  #include "cmsis_gcc.h"
+
+
+/*
+ * IAR Compiler
+ */
+#elif defined ( __ICCARM__ )
+  #include <cmsis_iccarm.h>
+
+
+/*
+ * TI Arm Compiler
+ */
+#elif defined ( __TI_ARM__ )
+  #include <cmsis_ccs.h>
+
+  #ifndef   __ASM
+    #define __ASM                                  __asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    #define __NO_RETURN                            __attribute__((noreturn))
+  #endif
+  #ifndef   __USED
+    #define __USED                                 __attribute__((used))
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __attribute__((weak))
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               __attribute__((packed))
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        struct __attribute__((packed))
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         union __attribute__((packed))
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #define __ALIGNED(x)                           __attribute__((aligned(x)))
+  #endif
+  #ifndef   __RESTRICT
+    #define __RESTRICT                             __restrict
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+/*
+ * TASKING Compiler
+ */
+#elif defined ( __TASKING__ )
+  /*
+   * The CMSIS functions have been implemented as intrinsics in the compiler.
+   * Please use "carm -?i" to get an up to date list of all intrinsics,
+   * Including the CMSIS ones.
+   */
+
+  #ifndef   __ASM
+    #define __ASM                                  __asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    #define __NO_RETURN                            __attribute__((noreturn))
+  #endif
+  #ifndef   __USED
+    #define __USED                                 __attribute__((used))
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __attribute__((weak))
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               __packed__
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        struct __packed__
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         union __packed__
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    struct __packed__ T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #define __ALIGNED(x)              __align(x)
+  #endif
+  #ifndef   __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+/*
+ * COSMIC Compiler
+ */
+#elif defined ( __CSMC__ )
+   #include <cmsis_csm.h>
+
+ #ifndef   __ASM
+    #define __ASM                                  _asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    // NO RETURN is automatically detected hence no warning here
+    #define __NO_RETURN
+  #endif
+  #ifndef   __USED
+    #warning No compiler specific solution for __USED. __USED is ignored.
+    #define __USED
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __weak
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               @packed
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        @packed struct
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         @packed union
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    @packed struct T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
+    #define __ALIGNED(x)
+  #endif
+  #ifndef   __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+#else
+  #error Unknown compiler.
+#endif
+
+
+#endif /* __CMSIS_COMPILER_H */
+

+ 2177 - 0
libraries/cmsis/cm4/core_support/cmsis_gcc.h

@@ -0,0 +1,2177 @@
+/******************************************************************************
+ * @file     cmsis_gcc.h
+ * @brief    CMSIS compiler GCC header file
+ * @version  V5.3.0
+ * @date     26. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_GCC_H
+#define __CMSIS_GCC_H
+
+/* ignore some GCC warnings */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-conversion"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+/* Fallback for __has_builtin */
+#ifndef __has_builtin
+  #define __has_builtin(x) (0)
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE
+  #define __INLINE                               inline
+#endif
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static inline
+#endif
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
+#endif
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START
+
+/**
+  \brief   Initializes data and bss sections
+  \details This default implementations initialized all data and additional bss
+           sections relying on .copy.table and .zero.table specified properly
+           in the used linker script.
+
+ */
+__STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
+{
+  extern void _start(void) __NO_RETURN;
+
+  typedef struct {
+    uint32_t const* src;
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __copy_table_t;
+
+  typedef struct {
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __zero_table_t;
+
+  extern const __copy_table_t __copy_table_start__;
+  extern const __copy_table_t __copy_table_end__;
+  extern const __zero_table_t __zero_table_start__;
+  extern const __zero_table_t __zero_table_end__;
+
+  for (__copy_table_t const* pTable = &__copy_table_start__; pTable < &__copy_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = pTable->src[i];
+    }
+  }
+
+  for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = 0u;
+    }
+  }
+
+  _start();
+}
+
+#define __PROGRAM_START           __cmsis_start
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              __StackTop
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             __StackLimit
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section(".vectors")))
+#endif
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_irq(void)
+{
+  __ASM volatile ("cpsie i" : : : "memory");
+}
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_irq(void)
+{
+  __ASM volatile ("cpsid i" : : : "memory");
+}
+
+
+/**
+  \brief   Get Control Register
+  \details Returns the content of the Control Register.
+  \return               Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Control Register (non-secure)
+  \details Returns the content of the non-secure Control Register when in secure mode.
+  \return               non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Control Register (non-secure)
+  \details Writes the given value to the non-secure Control Register when in secure state.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+  __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Get IPSR Register
+  \details Returns the content of the IPSR Register.
+  \return               IPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get APSR Register
+  \details Returns the content of the APSR Register.
+  \return               APSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get xPSR Register
+  \details Returns the content of the xPSR Register.
+  \return               xPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get Process Stack Pointer
+  \details Returns the current value of the Process Stack Pointer (PSP).
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp"  : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state.
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp_ns"  : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer
+  \details Assigns the given value to the Process Stack Pointer (PSP).
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state.
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer
+  \details Returns the current value of the Main Stack Pointer (MSP).
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Main Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state.
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer
+  \details Assigns the given value to the Main Stack Pointer (MSP).
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Main Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state.
+  \return               SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, sp_ns" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Set Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state.
+  \param [in]    topOfStack  Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+  __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Priority Mask
+  \details Returns the current state of the priority mask bit from the Priority Mask Register.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Priority Mask (non-secure)
+  \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Priority Mask (non-secure)
+  \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Enable FIQ
+  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_fault_irq(void)
+{
+  __ASM volatile ("cpsie f" : : : "memory");
+}
+
+
+/**
+  \brief   Disable FIQ
+  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_fault_irq(void)
+{
+  __ASM volatile ("cpsid f" : : : "memory");
+}
+
+
+/**
+  \brief   Get Base Priority
+  \details Returns the current value of the Base Priority register.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Base Priority (non-secure)
+  \details Returns the current value of the non-secure Base Priority register when in secure state.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Base Priority (non-secure)
+  \details Assigns the given value to the non-secure Base Priority register when in secure state.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+           or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Fault Mask (non-secure)
+  \details Returns the current value of the non-secure Fault Mask register when in secure state.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Fault Mask (non-secure)
+  \details Assigns the given value to the non-secure Fault Mask register when in secure state.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+
+/**
+  \brief   Get Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim"  : "=r" (result) );
+  return result;
+#endif
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim_ns"  : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));
+#endif
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+  return result;
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Get Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state.
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register.
+  \return               Floating Point Status/Control register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FPSCR(void)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#if __has_builtin(__builtin_arm_get_fpscr)
+// Re-enable using built-in when GCC has been fixed
+// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
+  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
+  return __builtin_arm_get_fpscr();
+#else
+  uint32_t result;
+
+  __ASM volatile ("VMRS %0, fpscr" : "=r" (result) );
+  return(result);
+#endif
+#else
+  return(0U);
+#endif
+}
+
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+__STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#if __has_builtin(__builtin_arm_set_fpscr)
+// Re-enable using built-in when GCC has been fixed
+// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
+  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
+  __builtin_arm_set_fpscr(fpscr);
+#else
+  __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory");
+#endif
+#else
+  (void)fpscr;
+#endif
+}
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_RW_REG(r) "+l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_RW_REG(r) "+r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/**
+  \brief   No Operation
+  \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP()                             __ASM volatile ("nop")
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI()                             __ASM volatile ("wfi":::"memory")
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter
+           a low-power state until one of a number of events occurs.
+ */
+#define __WFE()                             __ASM volatile ("wfe":::"memory")
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV()                             __ASM volatile ("sev")
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed.
+ */
+__STATIC_FORCEINLINE void __ISB(void)
+{
+  __ASM volatile ("isb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+__STATIC_FORCEINLINE void __DSB(void)
+{
+  __ASM volatile ("dsb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before
+           and after the instruction, without ensuring their completion.
+ */
+__STATIC_FORCEINLINE void __DMB(void)
+{
+  __ASM volatile ("dmb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV(uint32_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+  return __builtin_bswap32(value);
+#else
+  uint32_t result;
+
+  __ASM ("rev %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return result;
+#endif
+}
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM ("rev16 %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return result;
+}
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE int16_t __REVSH(int16_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+  return (int16_t)__builtin_bswap16(value);
+#else
+  int16_t result;
+
+  __ASM ("revsh %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return result;
+#endif
+}
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  op2 %= 32U;
+  if (op2 == 0U)
+  {
+    return op1;
+  }
+  return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)                       __ASM volatile ("bkpt "#value)
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
+{
+  uint32_t result;
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+   __ASM ("rbit %0, %1" : "=r" (result) : "r" (value) );
+#else
+  uint32_t s = (4U /*sizeof(v)*/ * 8U) - 1U; /* extra shift needed at end */
+
+  result = value;                      /* r will be reversed bits of v; first get LSB of v */
+  for (value >>= 1U; value != 0U; value >>= 1U)
+  {
+    result <<= 1U;
+    result |= value & 1U;
+    s--;
+  }
+  result <<= s;                        /* shift when v's highest bits are zero */
+#endif
+  return result;
+}
+
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+  /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+     __builtin_clz(0) is undefined behaviour, so handle this case specially.
+     This guarantees ARM-compatible results if happening to compile on a non-ARM
+     target, and ensures the compiler doesn't decide to activate any
+     optimisations using the logic "value was passed to __builtin_clz, so it
+     is non-zero".
+     ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
+     single CLZ instruction.
+   */
+  if (value == 0U)
+  {
+    return 32U;
+  }
+  return __builtin_clz(value);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes a exclusive LDR instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDREXB(volatile uint8_t *addr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+   return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes a exclusive LDR instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDREXH(volatile uint16_t *addr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+   return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes a exclusive LDR instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDREXW(volatile uint32_t *addr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) );
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes a exclusive STR instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes a exclusive STR instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes a exclusive STR instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) );
+   return(result);
+}
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX.
+ */
+__STATIC_FORCEINLINE void __CLREX(void)
+{
+  __ASM volatile ("clrex" ::: "memory");
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  ARG1  Value to be saturated
+  \param [in]  ARG2  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+#define __SSAT(ARG1, ARG2) \
+__extension__ \
+({                          \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("ssat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  ARG1  Value to be saturated
+  \param [in]  ARG2  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+#define __USAT(ARG1, ARG2) \
+ __extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("usat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return(result);
+}
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes a Unprivileged LDRT instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrbt %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" );
+#endif
+   return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes a Unprivileged LDRT instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrht %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" );
+#endif
+   return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes a Unprivileged LDRT instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return(result);
+}
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes a Unprivileged STRT instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+   __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes a Unprivileged STRT instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+   __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes a Unprivileged STRT instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+   __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );
+}
+
+#else  /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  if ((sat >= 1U) && (sat <= 32U))
+  {
+    const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+    const int32_t min = -1 - max ;
+    if (val > max)
+    {
+      return max;
+    }
+    else if (val < min)
+    {
+      return min;
+    }
+  }
+  return val;
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  if (sat <= 31U)
+  {
+    const uint32_t max = ((1U << sat) - 1U);
+    if (val > (int32_t)max)
+    {
+      return max;
+    }
+    else if (val < 0)
+    {
+      return 0U;
+    }
+  }
+  return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   Load-Acquire (8 bit)
+  \details Executes a LDAB instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (16 bit)
+  \details Executes a LDAH instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (32 bit)
+  \details Executes a LDA instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release (8 bit)
+  \details Executes a STLB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+   __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Store-Release (16 bit)
+  \details Executes a STLH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+   __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Store-Release (32 bit)
+  \details Executes a STL instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+   __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (8 bit)
+  \details Executes a LDAB exclusive instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAEXB(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaexb %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (16 bit)
+  \details Executes a LDAH exclusive instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAEXH(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaexh %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (32 bit)
+  \details Executes a LDA exclusive instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDAEX(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaex %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (8 bit)
+  \details Executes a STLB exclusive instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEXB(uint8_t value, volatile uint8_t *ptr)
+{
+   uint32_t result;
+
+   __ASM volatile ("stlexb %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (16 bit)
+  \details Executes a STLH exclusive instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEXH(uint16_t value, volatile uint16_t *ptr)
+{
+   uint32_t result;
+
+   __ASM volatile ("stlexh %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (32 bit)
+  \details Executes a STL exclusive instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEX(uint32_t value, volatile uint32_t *ptr)
+{
+   uint32_t result;
+
+   __ASM volatile ("stlex %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+   return(result);
+}
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+
+__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#define __SSAT16(ARG1, ARG2) \
+({                          \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("ssat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+#define __USAT16(ARG1, ARG2) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("usat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1)
+{
+  uint32_t result;
+
+  __ASM ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
+{
+  uint32_t result;
+
+  __ASM ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16_RORn(uint32_t op1, uint32_t rotate)
+{
+  uint32_t result;
+
+  __ASM ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (rotate) );
+
+  return result;
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUAD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SEL  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QADD( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+
+  __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QSUB( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+
+  __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+#if 0
+#define __PKHBT(ARG1,ARG2,ARG3) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+
+#define __PKHTB(ARG1,ARG2,ARG3) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  if (ARG3 == 0) \
+    __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2)  ); \
+  else \
+    __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+#endif
+
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+ int32_t result;
+
+ __ASM ("smmla %0, %1, %2, %3" : "=r" (result): "r"  (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#pragma GCC diagnostic pop
+
+#endif /* __CMSIS_GCC_H */

+ 968 - 0
libraries/cmsis/cm4/core_support/cmsis_iccarm.h

@@ -0,0 +1,968 @@
+/******************************************************************************
+ * @file     cmsis_iccarm.h
+ * @brief    CMSIS compiler ICCARM (IAR Compiler for Arm) header file
+ * @version  V5.2.0
+ * @date     28. January 2020
+ ******************************************************************************/
+
+//------------------------------------------------------------------------------
+//
+// Copyright (c) 2017-2019 IAR Systems
+// Copyright (c) 2017-2019 Arm Limited. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//------------------------------------------------------------------------------
+
+
+#ifndef __CMSIS_ICCARM_H__
+#define __CMSIS_ICCARM_H__
+
+#ifndef __ICCARM__
+  #error This file should only be compiled by ICCARM
+#endif
+
+#pragma system_include
+
+#define __IAR_FT _Pragma("inline=forced") __intrinsic
+
+#if (__VER__ >= 8000000)
+  #define __ICCARM_V8 1
+#else
+  #define __ICCARM_V8 0
+#endif
+
+#ifndef __ALIGNED
+  #if __ICCARM_V8
+    #define __ALIGNED(x) __attribute__((aligned(x)))
+  #elif (__VER__ >= 7080000)
+    /* Needs IAR language extensions */
+    #define __ALIGNED(x) __attribute__((aligned(x)))
+  #else
+    #warning No compiler specific solution for __ALIGNED.__ALIGNED is ignored.
+    #define __ALIGNED(x)
+  #endif
+#endif
+
+
+/* Define compiler macros for CPU architecture, used in CMSIS 5.
+ */
+#if __ARM_ARCH_6M__ || __ARM_ARCH_7M__ || __ARM_ARCH_7EM__ || __ARM_ARCH_8M_BASE__ || __ARM_ARCH_8M_MAIN__
+/* Macros already defined */
+#else
+  #if defined(__ARM8M_MAINLINE__) || defined(__ARM8EM_MAINLINE__)
+    #define __ARM_ARCH_8M_MAIN__ 1
+  #elif defined(__ARM8M_BASELINE__)
+    #define __ARM_ARCH_8M_BASE__ 1
+  #elif defined(__ARM_ARCH_PROFILE) && __ARM_ARCH_PROFILE == 'M'
+    #if __ARM_ARCH == 6
+      #define __ARM_ARCH_6M__ 1
+    #elif __ARM_ARCH == 7
+      #if __ARM_FEATURE_DSP
+        #define __ARM_ARCH_7EM__ 1
+      #else
+        #define __ARM_ARCH_7M__ 1
+      #endif
+    #endif /* __ARM_ARCH */
+  #endif /* __ARM_ARCH_PROFILE == 'M' */
+#endif
+
+/* Alternativ core deduction for older ICCARM's */
+#if !defined(__ARM_ARCH_6M__) && !defined(__ARM_ARCH_7M__) && !defined(__ARM_ARCH_7EM__) && \
+    !defined(__ARM_ARCH_8M_BASE__) && !defined(__ARM_ARCH_8M_MAIN__)
+  #if defined(__ARM6M__) && (__CORE__ == __ARM6M__)
+    #define __ARM_ARCH_6M__ 1
+  #elif defined(__ARM7M__) && (__CORE__ == __ARM7M__)
+    #define __ARM_ARCH_7M__ 1
+  #elif defined(__ARM7EM__) && (__CORE__ == __ARM7EM__)
+    #define __ARM_ARCH_7EM__  1
+  #elif defined(__ARM8M_BASELINE__) && (__CORE == __ARM8M_BASELINE__)
+    #define __ARM_ARCH_8M_BASE__ 1
+  #elif defined(__ARM8M_MAINLINE__) && (__CORE == __ARM8M_MAINLINE__)
+    #define __ARM_ARCH_8M_MAIN__ 1
+  #elif defined(__ARM8EM_MAINLINE__) && (__CORE == __ARM8EM_MAINLINE__)
+    #define __ARM_ARCH_8M_MAIN__ 1
+  #else
+    #error "Unknown target."
+  #endif
+#endif
+
+
+
+#if defined(__ARM_ARCH_6M__) && __ARM_ARCH_6M__==1
+  #define __IAR_M0_FAMILY  1
+#elif defined(__ARM_ARCH_8M_BASE__) && __ARM_ARCH_8M_BASE__==1
+  #define __IAR_M0_FAMILY  1
+#else
+  #define __IAR_M0_FAMILY  0
+#endif
+
+
+#ifndef __ASM
+  #define __ASM __asm
+#endif
+
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
+#endif
+
+#ifndef __INLINE
+  #define __INLINE inline
+#endif
+
+#ifndef   __NO_RETURN
+  #if __ICCARM_V8
+    #define __NO_RETURN __attribute__((__noreturn__))
+  #else
+    #define __NO_RETURN _Pragma("object_attribute=__noreturn")
+  #endif
+#endif
+
+#ifndef   __PACKED
+  #if __ICCARM_V8
+    #define __PACKED __attribute__((packed, aligned(1)))
+  #else
+    /* Needs IAR language extensions */
+    #define __PACKED __packed
+  #endif
+#endif
+
+#ifndef   __PACKED_STRUCT
+  #if __ICCARM_V8
+    #define __PACKED_STRUCT struct __attribute__((packed, aligned(1)))
+  #else
+    /* Needs IAR language extensions */
+    #define __PACKED_STRUCT __packed struct
+  #endif
+#endif
+
+#ifndef   __PACKED_UNION
+  #if __ICCARM_V8
+    #define __PACKED_UNION union __attribute__((packed, aligned(1)))
+  #else
+    /* Needs IAR language extensions */
+    #define __PACKED_UNION __packed union
+  #endif
+#endif
+
+#ifndef   __RESTRICT
+  #if __ICCARM_V8
+    #define __RESTRICT            __restrict
+  #else
+    /* Needs IAR language extensions */
+    #define __RESTRICT            restrict
+  #endif
+#endif
+
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE       static inline
+#endif
+
+#ifndef   __FORCEINLINE
+  #define __FORCEINLINE         _Pragma("inline=forced")
+#endif
+
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE  __FORCEINLINE __STATIC_INLINE
+#endif
+
+#ifndef __UNALIGNED_UINT16_READ
+#pragma language=save
+#pragma language=extended
+__IAR_FT uint16_t __iar_uint16_read(void const *ptr)
+{
+  return *(__packed uint16_t*)(ptr);
+}
+#pragma language=restore
+#define __UNALIGNED_UINT16_READ(PTR) __iar_uint16_read(PTR)
+#endif
+
+
+#ifndef __UNALIGNED_UINT16_WRITE
+#pragma language=save
+#pragma language=extended
+__IAR_FT void __iar_uint16_write(void const *ptr, uint16_t val)
+{
+  *(__packed uint16_t*)(ptr) = val;;
+}
+#pragma language=restore
+#define __UNALIGNED_UINT16_WRITE(PTR,VAL) __iar_uint16_write(PTR,VAL)
+#endif
+
+#ifndef __UNALIGNED_UINT32_READ
+#pragma language=save
+#pragma language=extended
+__IAR_FT uint32_t __iar_uint32_read(void const *ptr)
+{
+  return *(__packed uint32_t*)(ptr);
+}
+#pragma language=restore
+#define __UNALIGNED_UINT32_READ(PTR) __iar_uint32_read(PTR)
+#endif
+
+#ifndef __UNALIGNED_UINT32_WRITE
+#pragma language=save
+#pragma language=extended
+__IAR_FT void __iar_uint32_write(void const *ptr, uint32_t val)
+{
+  *(__packed uint32_t*)(ptr) = val;;
+}
+#pragma language=restore
+#define __UNALIGNED_UINT32_WRITE(PTR,VAL) __iar_uint32_write(PTR,VAL)
+#endif
+
+#ifndef __UNALIGNED_UINT32   /* deprecated */
+#pragma language=save
+#pragma language=extended
+__packed struct  __iar_u32 { uint32_t v; };
+#pragma language=restore
+#define __UNALIGNED_UINT32(PTR) (((struct __iar_u32 *)(PTR))->v)
+#endif
+
+#ifndef   __USED
+  #if __ICCARM_V8
+    #define __USED __attribute__((used))
+  #else
+    #define __USED _Pragma("__root")
+  #endif
+#endif
+
+#ifndef   __WEAK
+  #if __ICCARM_V8
+    #define __WEAK __attribute__((weak))
+  #else
+    #define __WEAK _Pragma("__weak")
+  #endif
+#endif
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START           __iar_program_start
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              CSTACK$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             CSTACK$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __vector_table
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  @".intvec"
+#endif
+
+#ifndef __ICCARM_INTRINSICS_VERSION__
+  #define __ICCARM_INTRINSICS_VERSION__  0
+#endif
+
+#if __ICCARM_INTRINSICS_VERSION__ == 2
+
+  #if defined(__CLZ)
+    #undef __CLZ
+  #endif
+  #if defined(__REVSH)
+    #undef __REVSH
+  #endif
+  #if defined(__RBIT)
+    #undef __RBIT
+  #endif
+  #if defined(__SSAT)
+    #undef __SSAT
+  #endif
+  #if defined(__USAT)
+    #undef __USAT
+  #endif
+
+  #include "iccarm_builtin.h"
+
+  #define __disable_fault_irq __iar_builtin_disable_fiq
+  #define __disable_irq       __iar_builtin_disable_interrupt
+  #define __enable_fault_irq  __iar_builtin_enable_fiq
+  #define __enable_irq        __iar_builtin_enable_interrupt
+  #define __arm_rsr           __iar_builtin_rsr
+  #define __arm_wsr           __iar_builtin_wsr
+
+
+  #define __get_APSR()                (__arm_rsr("APSR"))
+  #define __get_BASEPRI()             (__arm_rsr("BASEPRI"))
+  #define __get_CONTROL()             (__arm_rsr("CONTROL"))
+  #define __get_FAULTMASK()           (__arm_rsr("FAULTMASK"))
+
+  #if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+       (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+    #define __get_FPSCR()             (__arm_rsr("FPSCR"))
+    #define __set_FPSCR(VALUE)        (__arm_wsr("FPSCR", (VALUE)))
+  #else
+    #define __get_FPSCR()             ( 0 )
+    #define __set_FPSCR(VALUE)        ((void)VALUE)
+  #endif
+
+  #define __get_IPSR()                (__arm_rsr("IPSR"))
+  #define __get_MSP()                 (__arm_rsr("MSP"))
+  #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+       (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure MSPLIM is RAZ/WI
+    #define __get_MSPLIM()            (0U)
+  #else
+    #define __get_MSPLIM()            (__arm_rsr("MSPLIM"))
+  #endif
+  #define __get_PRIMASK()             (__arm_rsr("PRIMASK"))
+  #define __get_PSP()                 (__arm_rsr("PSP"))
+
+  #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+       (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+    #define __get_PSPLIM()            (0U)
+  #else
+    #define __get_PSPLIM()            (__arm_rsr("PSPLIM"))
+  #endif
+
+  #define __get_xPSR()                (__arm_rsr("xPSR"))
+
+  #define __set_BASEPRI(VALUE)        (__arm_wsr("BASEPRI", (VALUE)))
+  #define __set_BASEPRI_MAX(VALUE)    (__arm_wsr("BASEPRI_MAX", (VALUE)))
+  #define __set_CONTROL(VALUE)        (__arm_wsr("CONTROL", (VALUE)))
+  #define __set_FAULTMASK(VALUE)      (__arm_wsr("FAULTMASK", (VALUE)))
+  #define __set_MSP(VALUE)            (__arm_wsr("MSP", (VALUE)))
+
+  #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+       (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure MSPLIM is RAZ/WI
+    #define __set_MSPLIM(VALUE)       ((void)(VALUE))
+  #else
+    #define __set_MSPLIM(VALUE)       (__arm_wsr("MSPLIM", (VALUE)))
+  #endif
+  #define __set_PRIMASK(VALUE)        (__arm_wsr("PRIMASK", (VALUE)))
+  #define __set_PSP(VALUE)            (__arm_wsr("PSP", (VALUE)))
+  #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+       (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+    #define __set_PSPLIM(VALUE)       ((void)(VALUE))
+  #else
+    #define __set_PSPLIM(VALUE)       (__arm_wsr("PSPLIM", (VALUE)))
+  #endif
+
+  #define __TZ_get_CONTROL_NS()       (__arm_rsr("CONTROL_NS"))
+  #define __TZ_set_CONTROL_NS(VALUE)  (__arm_wsr("CONTROL_NS", (VALUE)))
+  #define __TZ_get_PSP_NS()           (__arm_rsr("PSP_NS"))
+  #define __TZ_set_PSP_NS(VALUE)      (__arm_wsr("PSP_NS", (VALUE)))
+  #define __TZ_get_MSP_NS()           (__arm_rsr("MSP_NS"))
+  #define __TZ_set_MSP_NS(VALUE)      (__arm_wsr("MSP_NS", (VALUE)))
+  #define __TZ_get_SP_NS()            (__arm_rsr("SP_NS"))
+  #define __TZ_set_SP_NS(VALUE)       (__arm_wsr("SP_NS", (VALUE)))
+  #define __TZ_get_PRIMASK_NS()       (__arm_rsr("PRIMASK_NS"))
+  #define __TZ_set_PRIMASK_NS(VALUE)  (__arm_wsr("PRIMASK_NS", (VALUE)))
+  #define __TZ_get_BASEPRI_NS()       (__arm_rsr("BASEPRI_NS"))
+  #define __TZ_set_BASEPRI_NS(VALUE)  (__arm_wsr("BASEPRI_NS", (VALUE)))
+  #define __TZ_get_FAULTMASK_NS()     (__arm_rsr("FAULTMASK_NS"))
+  #define __TZ_set_FAULTMASK_NS(VALUE)(__arm_wsr("FAULTMASK_NS", (VALUE)))
+
+  #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+       (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+    #define __TZ_get_PSPLIM_NS()      (0U)
+    #define __TZ_set_PSPLIM_NS(VALUE) ((void)(VALUE))
+  #else
+    #define __TZ_get_PSPLIM_NS()      (__arm_rsr("PSPLIM_NS"))
+    #define __TZ_set_PSPLIM_NS(VALUE) (__arm_wsr("PSPLIM_NS", (VALUE)))
+  #endif
+
+  #define __TZ_get_MSPLIM_NS()        (__arm_rsr("MSPLIM_NS"))
+  #define __TZ_set_MSPLIM_NS(VALUE)   (__arm_wsr("MSPLIM_NS", (VALUE)))
+
+  #define __NOP     __iar_builtin_no_operation
+
+  #define __CLZ     __iar_builtin_CLZ
+  #define __CLREX   __iar_builtin_CLREX
+
+  #define __DMB     __iar_builtin_DMB
+  #define __DSB     __iar_builtin_DSB
+  #define __ISB     __iar_builtin_ISB
+
+  #define __LDREXB  __iar_builtin_LDREXB
+  #define __LDREXH  __iar_builtin_LDREXH
+  #define __LDREXW  __iar_builtin_LDREX
+
+  #define __RBIT    __iar_builtin_RBIT
+  #define __REV     __iar_builtin_REV
+  #define __REV16   __iar_builtin_REV16
+
+  __IAR_FT int16_t __REVSH(int16_t val)
+  {
+    return (int16_t) __iar_builtin_REVSH(val);
+  }
+
+  #define __ROR     __iar_builtin_ROR
+  #define __RRX     __iar_builtin_RRX
+
+  #define __SEV     __iar_builtin_SEV
+
+  #if !__IAR_M0_FAMILY
+    #define __SSAT    __iar_builtin_SSAT
+  #endif
+
+  #define __STREXB  __iar_builtin_STREXB
+  #define __STREXH  __iar_builtin_STREXH
+  #define __STREXW  __iar_builtin_STREX
+
+  #if !__IAR_M0_FAMILY
+    #define __USAT    __iar_builtin_USAT
+  #endif
+
+  #define __WFE     __iar_builtin_WFE
+  #define __WFI     __iar_builtin_WFI
+
+  #if __ARM_MEDIA__
+    #define __SADD8   __iar_builtin_SADD8
+    #define __QADD8   __iar_builtin_QADD8
+    #define __SHADD8  __iar_builtin_SHADD8
+    #define __UADD8   __iar_builtin_UADD8
+    #define __UQADD8  __iar_builtin_UQADD8
+    #define __UHADD8  __iar_builtin_UHADD8
+    #define __SSUB8   __iar_builtin_SSUB8
+    #define __QSUB8   __iar_builtin_QSUB8
+    #define __SHSUB8  __iar_builtin_SHSUB8
+    #define __USUB8   __iar_builtin_USUB8
+    #define __UQSUB8  __iar_builtin_UQSUB8
+    #define __UHSUB8  __iar_builtin_UHSUB8
+    #define __SADD16  __iar_builtin_SADD16
+    #define __QADD16  __iar_builtin_QADD16
+    #define __SHADD16 __iar_builtin_SHADD16
+    #define __UADD16  __iar_builtin_UADD16
+    #define __UQADD16 __iar_builtin_UQADD16
+    #define __UHADD16 __iar_builtin_UHADD16
+    #define __SSUB16  __iar_builtin_SSUB16
+    #define __QSUB16  __iar_builtin_QSUB16
+    #define __SHSUB16 __iar_builtin_SHSUB16
+    #define __USUB16  __iar_builtin_USUB16
+    #define __UQSUB16 __iar_builtin_UQSUB16
+    #define __UHSUB16 __iar_builtin_UHSUB16
+    #define __SASX    __iar_builtin_SASX
+    #define __QASX    __iar_builtin_QASX
+    #define __SHASX   __iar_builtin_SHASX
+    #define __UASX    __iar_builtin_UASX
+    #define __UQASX   __iar_builtin_UQASX
+    #define __UHASX   __iar_builtin_UHASX
+    #define __SSAX    __iar_builtin_SSAX
+    #define __QSAX    __iar_builtin_QSAX
+    #define __SHSAX   __iar_builtin_SHSAX
+    #define __USAX    __iar_builtin_USAX
+    #define __UQSAX   __iar_builtin_UQSAX
+    #define __UHSAX   __iar_builtin_UHSAX
+    #define __USAD8   __iar_builtin_USAD8
+    #define __USADA8  __iar_builtin_USADA8
+    #define __SSAT16  __iar_builtin_SSAT16
+    #define __USAT16  __iar_builtin_USAT16
+    #define __UXTB16  __iar_builtin_UXTB16
+    #define __UXTAB16 __iar_builtin_UXTAB16
+    #define __SXTB16  __iar_builtin_SXTB16
+    #define __SXTAB16 __iar_builtin_SXTAB16
+    #define __SMUAD   __iar_builtin_SMUAD
+    #define __SMUADX  __iar_builtin_SMUADX
+    #define __SMMLA   __iar_builtin_SMMLA
+    #define __SMLAD   __iar_builtin_SMLAD
+    #define __SMLADX  __iar_builtin_SMLADX
+    #define __SMLALD  __iar_builtin_SMLALD
+    #define __SMLALDX __iar_builtin_SMLALDX
+    #define __SMUSD   __iar_builtin_SMUSD
+    #define __SMUSDX  __iar_builtin_SMUSDX
+    #define __SMLSD   __iar_builtin_SMLSD
+    #define __SMLSDX  __iar_builtin_SMLSDX
+    #define __SMLSLD  __iar_builtin_SMLSLD
+    #define __SMLSLDX __iar_builtin_SMLSLDX
+    #define __SEL     __iar_builtin_SEL
+    #define __QADD    __iar_builtin_QADD
+    #define __QSUB    __iar_builtin_QSUB
+    #define __PKHBT   __iar_builtin_PKHBT
+    #define __PKHTB   __iar_builtin_PKHTB
+  #endif
+
+#else /* __ICCARM_INTRINSICS_VERSION__ == 2 */
+
+  #if __IAR_M0_FAMILY
+   /* Avoid clash between intrinsics.h and arm_math.h when compiling for Cortex-M0. */
+    #define __CLZ  __cmsis_iar_clz_not_active
+    #define __SSAT __cmsis_iar_ssat_not_active
+    #define __USAT __cmsis_iar_usat_not_active
+    #define __RBIT __cmsis_iar_rbit_not_active
+    #define __get_APSR  __cmsis_iar_get_APSR_not_active
+  #endif
+
+
+  #if (!((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+         (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     ))
+    #define __get_FPSCR __cmsis_iar_get_FPSR_not_active
+    #define __set_FPSCR __cmsis_iar_set_FPSR_not_active
+  #endif
+
+  #ifdef __INTRINSICS_INCLUDED
+  #error intrinsics.h is already included previously!
+  #endif
+
+  #include <intrinsics.h>
+
+  #if __IAR_M0_FAMILY
+   /* Avoid clash between intrinsics.h and arm_math.h when compiling for Cortex-M0. */
+    #undef __CLZ
+    #undef __SSAT
+    #undef __USAT
+    #undef __RBIT
+    #undef __get_APSR
+
+    __STATIC_INLINE uint8_t __CLZ(uint32_t data)
+    {
+      if (data == 0U) { return 32U; }
+
+      uint32_t count = 0U;
+      uint32_t mask = 0x80000000U;
+
+      while ((data & mask) == 0U)
+      {
+        count += 1U;
+        mask = mask >> 1U;
+      }
+      return count;
+    }
+
+    __STATIC_INLINE uint32_t __RBIT(uint32_t v)
+    {
+      uint8_t sc = 31U;
+      uint32_t r = v;
+      for (v >>= 1U; v; v >>= 1U)
+      {
+        r <<= 1U;
+        r |= v & 1U;
+        sc--;
+      }
+      return (r << sc);
+    }
+
+    __STATIC_INLINE  uint32_t __get_APSR(void)
+    {
+      uint32_t res;
+      __asm("MRS      %0,APSR" : "=r" (res));
+      return res;
+    }
+
+  #endif
+
+  #if (!((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+         (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     ))
+    #undef __get_FPSCR
+    #undef __set_FPSCR
+    #define __get_FPSCR()       (0)
+    #define __set_FPSCR(VALUE)  ((void)VALUE)
+  #endif
+
+  #pragma diag_suppress=Pe940
+  #pragma diag_suppress=Pe177
+
+  #define __enable_irq    __enable_interrupt
+  #define __disable_irq   __disable_interrupt
+  #define __NOP           __no_operation
+
+  #define __get_xPSR      __get_PSR
+
+  #if (!defined(__ARM_ARCH_6M__) || __ARM_ARCH_6M__==0)
+
+    __IAR_FT uint32_t __LDREXW(uint32_t volatile *ptr)
+    {
+      return __LDREX((unsigned long *)ptr);
+    }
+
+    __IAR_FT uint32_t __STREXW(uint32_t value, uint32_t volatile *ptr)
+    {
+      return __STREX(value, (unsigned long *)ptr);
+    }
+  #endif
+
+
+  /* __CORTEX_M is defined in core_cm0.h, core_cm3.h and core_cm4.h. */
+  #if (__CORTEX_M >= 0x03)
+
+    __IAR_FT uint32_t __RRX(uint32_t value)
+    {
+      uint32_t result;
+      __ASM volatile("RRX      %0, %1" : "=r"(result) : "r" (value));
+      return(result);
+    }
+
+    __IAR_FT void __set_BASEPRI_MAX(uint32_t value)
+    {
+      __asm volatile("MSR      BASEPRI_MAX,%0"::"r" (value));
+    }
+
+
+    #define __enable_fault_irq  __enable_fiq
+    #define __disable_fault_irq __disable_fiq
+
+
+  #endif /* (__CORTEX_M >= 0x03) */
+
+  __IAR_FT uint32_t __ROR(uint32_t op1, uint32_t op2)
+  {
+    return (op1 >> op2) | (op1 << ((sizeof(op1)*8)-op2));
+  }
+
+  #if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+       (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+
+   __IAR_FT uint32_t __get_MSPLIM(void)
+    {
+      uint32_t res;
+    #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+         (!defined (__ARM_FEATURE_CMSE  ) || (__ARM_FEATURE_CMSE   < 3)))
+      // without main extensions, the non-secure MSPLIM is RAZ/WI
+      res = 0U;
+    #else
+      __asm volatile("MRS      %0,MSPLIM" : "=r" (res));
+    #endif
+      return res;
+    }
+
+    __IAR_FT void   __set_MSPLIM(uint32_t value)
+    {
+    #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+         (!defined (__ARM_FEATURE_CMSE  ) || (__ARM_FEATURE_CMSE   < 3)))
+      // without main extensions, the non-secure MSPLIM is RAZ/WI
+      (void)value;
+    #else
+      __asm volatile("MSR      MSPLIM,%0" :: "r" (value));
+    #endif
+    }
+
+    __IAR_FT uint32_t __get_PSPLIM(void)
+    {
+      uint32_t res;
+    #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+         (!defined (__ARM_FEATURE_CMSE  ) || (__ARM_FEATURE_CMSE   < 3)))
+      // without main extensions, the non-secure PSPLIM is RAZ/WI
+      res = 0U;
+    #else
+      __asm volatile("MRS      %0,PSPLIM" : "=r" (res));
+    #endif
+      return res;
+    }
+
+    __IAR_FT void   __set_PSPLIM(uint32_t value)
+    {
+    #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+         (!defined (__ARM_FEATURE_CMSE  ) || (__ARM_FEATURE_CMSE   < 3)))
+      // without main extensions, the non-secure PSPLIM is RAZ/WI
+      (void)value;
+    #else
+      __asm volatile("MSR      PSPLIM,%0" :: "r" (value));
+    #endif
+    }
+
+    __IAR_FT uint32_t __TZ_get_CONTROL_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,CONTROL_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_CONTROL_NS(uint32_t value)
+    {
+      __asm volatile("MSR      CONTROL_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_PSP_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,PSP_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_PSP_NS(uint32_t value)
+    {
+      __asm volatile("MSR      PSP_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_MSP_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,MSP_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_MSP_NS(uint32_t value)
+    {
+      __asm volatile("MSR      MSP_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_SP_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,SP_NS" : "=r" (res));
+      return res;
+    }
+    __IAR_FT void   __TZ_set_SP_NS(uint32_t value)
+    {
+      __asm volatile("MSR      SP_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_PRIMASK_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,PRIMASK_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_PRIMASK_NS(uint32_t value)
+    {
+      __asm volatile("MSR      PRIMASK_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_BASEPRI_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,BASEPRI_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_BASEPRI_NS(uint32_t value)
+    {
+      __asm volatile("MSR      BASEPRI_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_FAULTMASK_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,FAULTMASK_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_FAULTMASK_NS(uint32_t value)
+    {
+      __asm volatile("MSR      FAULTMASK_NS,%0" :: "r" (value));
+    }
+
+    __IAR_FT uint32_t   __TZ_get_PSPLIM_NS(void)
+    {
+      uint32_t res;
+    #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+         (!defined (__ARM_FEATURE_CMSE  ) || (__ARM_FEATURE_CMSE   < 3)))
+      // without main extensions, the non-secure PSPLIM is RAZ/WI
+      res = 0U;
+    #else
+      __asm volatile("MRS      %0,PSPLIM_NS" : "=r" (res));
+    #endif
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_PSPLIM_NS(uint32_t value)
+    {
+    #if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+         (!defined (__ARM_FEATURE_CMSE  ) || (__ARM_FEATURE_CMSE   < 3)))
+      // without main extensions, the non-secure PSPLIM is RAZ/WI
+      (void)value;
+    #else
+      __asm volatile("MSR      PSPLIM_NS,%0" :: "r" (value));
+    #endif
+    }
+
+    __IAR_FT uint32_t   __TZ_get_MSPLIM_NS(void)
+    {
+      uint32_t res;
+      __asm volatile("MRS      %0,MSPLIM_NS" : "=r" (res));
+      return res;
+    }
+
+    __IAR_FT void   __TZ_set_MSPLIM_NS(uint32_t value)
+    {
+      __asm volatile("MSR      MSPLIM_NS,%0" :: "r" (value));
+    }
+
+  #endif /* __ARM_ARCH_8M_MAIN__ or __ARM_ARCH_8M_BASE__ */
+
+#endif   /* __ICCARM_INTRINSICS_VERSION__ == 2 */
+
+#define __BKPT(value)    __asm volatile ("BKPT     %0" : : "i"(value))
+
+#if __IAR_M0_FAMILY
+  __STATIC_INLINE int32_t __SSAT(int32_t val, uint32_t sat)
+  {
+    if ((sat >= 1U) && (sat <= 32U))
+    {
+      const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+      const int32_t min = -1 - max ;
+      if (val > max)
+      {
+        return max;
+      }
+      else if (val < min)
+      {
+        return min;
+      }
+    }
+    return val;
+  }
+
+  __STATIC_INLINE uint32_t __USAT(int32_t val, uint32_t sat)
+  {
+    if (sat <= 31U)
+    {
+      const uint32_t max = ((1U << sat) - 1U);
+      if (val > (int32_t)max)
+      {
+        return max;
+      }
+      else if (val < 0)
+      {
+        return 0U;
+      }
+    }
+    return (uint32_t)val;
+  }
+#endif
+
+#if (__CORTEX_M >= 0x03)   /* __CORTEX_M is defined in core_cm0.h, core_cm3.h and core_cm4.h. */
+
+  __IAR_FT uint8_t __LDRBT(volatile uint8_t *addr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDRBT %0, [%1]" : "=r" (res) : "r" (addr) : "memory");
+    return ((uint8_t)res);
+  }
+
+  __IAR_FT uint16_t __LDRHT(volatile uint16_t *addr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDRHT %0, [%1]" : "=r" (res) : "r" (addr) : "memory");
+    return ((uint16_t)res);
+  }
+
+  __IAR_FT uint32_t __LDRT(volatile uint32_t *addr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDRT %0, [%1]" : "=r" (res) : "r" (addr) : "memory");
+    return res;
+  }
+
+  __IAR_FT void __STRBT(uint8_t value, volatile uint8_t *addr)
+  {
+    __ASM volatile ("STRBT %1, [%0]" : : "r" (addr), "r" ((uint32_t)value) : "memory");
+  }
+
+  __IAR_FT void __STRHT(uint16_t value, volatile uint16_t *addr)
+  {
+    __ASM volatile ("STRHT %1, [%0]" : : "r" (addr), "r" ((uint32_t)value) : "memory");
+  }
+
+  __IAR_FT void __STRT(uint32_t value, volatile uint32_t *addr)
+  {
+    __ASM volatile ("STRT %1, [%0]" : : "r" (addr), "r" (value) : "memory");
+  }
+
+#endif /* (__CORTEX_M >= 0x03) */
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+
+
+  __IAR_FT uint8_t __LDAB(volatile uint8_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDAB %0, [%1]" : "=r" (res) : "r" (ptr) : "memory");
+    return ((uint8_t)res);
+  }
+
+  __IAR_FT uint16_t __LDAH(volatile uint16_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDAH %0, [%1]" : "=r" (res) : "r" (ptr) : "memory");
+    return ((uint16_t)res);
+  }
+
+  __IAR_FT uint32_t __LDA(volatile uint32_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDA %0, [%1]" : "=r" (res) : "r" (ptr) : "memory");
+    return res;
+  }
+
+  __IAR_FT void __STLB(uint8_t value, volatile uint8_t *ptr)
+  {
+    __ASM volatile ("STLB %1, [%0]" :: "r" (ptr), "r" (value) : "memory");
+  }
+
+  __IAR_FT void __STLH(uint16_t value, volatile uint16_t *ptr)
+  {
+    __ASM volatile ("STLH %1, [%0]" :: "r" (ptr), "r" (value) : "memory");
+  }
+
+  __IAR_FT void __STL(uint32_t value, volatile uint32_t *ptr)
+  {
+    __ASM volatile ("STL %1, [%0]" :: "r" (ptr), "r" (value) : "memory");
+  }
+
+  __IAR_FT uint8_t __LDAEXB(volatile uint8_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDAEXB %0, [%1]" : "=r" (res) : "r" (ptr) : "memory");
+    return ((uint8_t)res);
+  }
+
+  __IAR_FT uint16_t __LDAEXH(volatile uint16_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDAEXH %0, [%1]" : "=r" (res) : "r" (ptr) : "memory");
+    return ((uint16_t)res);
+  }
+
+  __IAR_FT uint32_t __LDAEX(volatile uint32_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("LDAEX %0, [%1]" : "=r" (res) : "r" (ptr) : "memory");
+    return res;
+  }
+
+  __IAR_FT uint32_t __STLEXB(uint8_t value, volatile uint8_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("STLEXB %0, %2, [%1]" : "=r" (res) : "r" (ptr), "r" (value) : "memory");
+    return res;
+  }
+
+  __IAR_FT uint32_t __STLEXH(uint16_t value, volatile uint16_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("STLEXH %0, %2, [%1]" : "=r" (res) : "r" (ptr), "r" (value) : "memory");
+    return res;
+  }
+
+  __IAR_FT uint32_t __STLEX(uint32_t value, volatile uint32_t *ptr)
+  {
+    uint32_t res;
+    __ASM volatile ("STLEX %0, %2, [%1]" : "=r" (res) : "r" (ptr), "r" (value) : "memory");
+    return res;
+  }
+
+#endif /* __ARM_ARCH_8M_MAIN__ or __ARM_ARCH_8M_BASE__ */
+
+#undef __IAR_FT
+#undef __IAR_M0_FAMILY
+#undef __ICCARM_V8
+
+#pragma diag_default=Pe940
+#pragma diag_default=Pe177
+
+#define __SXTB16_RORn(ARG1, ARG2) __SXTB16(__ROR(ARG1, ARG2))
+
+#endif /* __CMSIS_ICCARM_H__ */

+ 39 - 0
libraries/cmsis/cm4/core_support/cmsis_version.h

@@ -0,0 +1,39 @@
+/******************************************************************************
+ * @file     cmsis_version.h
+ * @brief    CMSIS Core(M) Version definitions
+ * @version  V5.0.4
+ * @date     23. July 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2019 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header   /* treat file as system include file */
+#endif
+
+#ifndef __CMSIS_VERSION_H
+#define __CMSIS_VERSION_H
+
+/*  CMSIS Version definitions */
+#define __CM_CMSIS_VERSION_MAIN  ( 5U)                                      /*!< [31:16] CMSIS Core(M) main version */
+#define __CM_CMSIS_VERSION_SUB   ( 4U)                                      /*!< [15:0]  CMSIS Core(M) sub version */
+#define __CM_CMSIS_VERSION       ((__CM_CMSIS_VERSION_MAIN << 16U) | \
+                                   __CM_CMSIS_VERSION_SUB           )       /*!< CMSIS Core(M) version number */
+#endif

+ 2129 - 0
libraries/cmsis/cm4/core_support/core_cm4.h

@@ -0,0 +1,2129 @@
+/**************************************************************************
+ * @file     core_cm4.h
+ * @brief    CMSIS Cortex-M4 Core Peripheral Access Layer Header File
+ * @version  V5.1.1
+ * @date     27. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header   /* treat file as system include file */
+#endif
+
+#ifndef __CORE_CM4_H_GENERIC
+#define __CORE_CM4_H_GENERIC
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/**
+  \page CMSIS_MISRA_Exceptions  MISRA-C:2004 Compliance Exceptions
+  CMSIS violates the following MISRA-C:2004 rules:
+
+   \li Required Rule 8.5, object/function definition in header file.<br>
+     Function definitions in header files are used to allow 'inlining'.
+
+   \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.<br>
+     Unions are used for effective representation of core registers.
+
+   \li Advisory Rule 19.7, Function-like macro defined.<br>
+     Function-like macros are used to allow more efficient code.
+ */
+
+
+/*******************************************************************************
+ *                 CMSIS definitions
+ ******************************************************************************/
+/**
+  \ingroup Cortex_M4
+  @{
+ */
+
+#include "cmsis_version.h"
+
+/* CMSIS CM4 definitions */
+#define __CM4_CMSIS_VERSION_MAIN  (__CM_CMSIS_VERSION_MAIN)              /*!< \deprecated [31:16] CMSIS HAL main version */
+#define __CM4_CMSIS_VERSION_SUB   (__CM_CMSIS_VERSION_SUB)               /*!< \deprecated [15:0]  CMSIS HAL sub version */
+#define __CM4_CMSIS_VERSION       ((__CM4_CMSIS_VERSION_MAIN << 16U) | \
+                                    __CM4_CMSIS_VERSION_SUB           )  /*!< \deprecated CMSIS HAL version number */
+
+#define __CORTEX_M                (4U)                                   /*!< Cortex-M Core */
+
+/** __FPU_USED indicates whether an FPU is used or not.
+    For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions.
+*/
+#if defined ( __CC_ARM )
+  #if defined __TARGET_FPU_VFP
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
+  #if defined __ARM_FP
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __GNUC__ )
+  #if defined (__VFP_FP__) && !defined(__SOFTFP__)
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __ICCARM__ )
+  #if defined __ARMVFP__
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __TI_ARM__ )
+  #if defined __TI_VFP_SUPPORT__
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __TASKING__ )
+  #if defined __FPU_VFP__
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __CSMC__ )
+  #if ( __CSMC__ & 0x400U)
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#endif
+
+#include "cmsis_compiler.h"               /* CMSIS compiler specific defines */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM4_H_GENERIC */
+
+#ifndef __CMSIS_GENERIC
+
+#ifndef __CORE_CM4_H_DEPENDANT
+#define __CORE_CM4_H_DEPENDANT
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* check device defines and use defaults */
+#if defined __CHECK_DEVICE_DEFINES
+  #ifndef __CM4_REV
+    #define __CM4_REV               0x0000U
+    #warning "__CM4_REV not defined in device header file; using default!"
+  #endif
+
+  #ifndef __FPU_PRESENT
+    #define __FPU_PRESENT             0U
+    #warning "__FPU_PRESENT not defined in device header file; using default!"
+  #endif
+
+  #ifndef __MPU_PRESENT
+    #define __MPU_PRESENT             0U
+    #warning "__MPU_PRESENT not defined in device header file; using default!"
+  #endif
+
+  #ifndef __VTOR_PRESENT
+    #define __VTOR_PRESENT             1U
+    #warning "__VTOR_PRESENT not defined in device header file; using default!"
+  #endif
+
+  #ifndef __NVIC_PRIO_BITS
+    #define __NVIC_PRIO_BITS          3U
+    #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
+  #endif
+
+  #ifndef __Vendor_SysTickConfig
+    #define __Vendor_SysTickConfig    0U
+    #warning "__Vendor_SysTickConfig not defined in device header file; using default!"
+  #endif
+#endif
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+    \defgroup CMSIS_glob_defs CMSIS Global Defines
+
+    <strong>IO Type Qualifiers</strong> are used
+    \li to specify the access to peripheral variables.
+    \li for automatic generation of peripheral register debug information.
+*/
+#ifdef __cplusplus
+  #define   __I     volatile             /*!< Defines 'read only' permissions */
+#else
+  #define   __I     volatile const       /*!< Defines 'read only' permissions */
+#endif
+#define     __O     volatile             /*!< Defines 'write only' permissions */
+#define     __IO    volatile             /*!< Defines 'read / write' permissions */
+
+/* following defines should be used for structure members */
+#define     __IM     volatile const      /*! Defines 'read only' structure member permissions */
+#define     __OM     volatile            /*! Defines 'write only' structure member permissions */
+#define     __IOM    volatile            /*! Defines 'read / write' structure member permissions */
+
+/*@} end of group Cortex_M4 */
+
+
+
+/*******************************************************************************
+ *                 Register Abstraction
+  Core Register contain:
+  - Core Register
+  - Core NVIC Register
+  - Core SCB Register
+  - Core SysTick Register
+  - Core Debug Register
+  - Core MPU Register
+  - Core FPU Register
+ ******************************************************************************/
+/**
+  \defgroup CMSIS_core_register Defines and Type Definitions
+  \brief Type definitions and defines for Cortex-M processor based devices.
+*/
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_CORE  Status and Control Registers
+  \brief      Core Register type definitions.
+  @{
+ */
+
+/**
+  \brief  Union type to access the Application Program Status Register (APSR).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t _reserved0:16;              /*!< bit:  0..15  Reserved */
+    uint32_t GE:4;                       /*!< bit: 16..19  Greater than or Equal flags */
+    uint32_t _reserved1:7;               /*!< bit: 20..26  Reserved */
+    uint32_t Q:1;                        /*!< bit:     27  Saturation condition flag */
+    uint32_t V:1;                        /*!< bit:     28  Overflow condition code flag */
+    uint32_t C:1;                        /*!< bit:     29  Carry condition code flag */
+    uint32_t Z:1;                        /*!< bit:     30  Zero condition code flag */
+    uint32_t N:1;                        /*!< bit:     31  Negative condition code flag */
+  } b;                                   /*!< Structure used for bit  access */
+  uint32_t w;                            /*!< Type      used for word access */
+} APSR_Type;
+
+/* APSR Register Definitions */
+#define APSR_N_Pos                         31U                                            /*!< APSR: N Position */
+#define APSR_N_Msk                         (1UL << APSR_N_Pos)                            /*!< APSR: N Mask */
+
+#define APSR_Z_Pos                         30U                                            /*!< APSR: Z Position */
+#define APSR_Z_Msk                         (1UL << APSR_Z_Pos)                            /*!< APSR: Z Mask */
+
+#define APSR_C_Pos                         29U                                            /*!< APSR: C Position */
+#define APSR_C_Msk                         (1UL << APSR_C_Pos)                            /*!< APSR: C Mask */
+
+#define APSR_V_Pos                         28U                                            /*!< APSR: V Position */
+#define APSR_V_Msk                         (1UL << APSR_V_Pos)                            /*!< APSR: V Mask */
+
+#define APSR_Q_Pos                         27U                                            /*!< APSR: Q Position */
+#define APSR_Q_Msk                         (1UL << APSR_Q_Pos)                            /*!< APSR: Q Mask */
+
+#define APSR_GE_Pos                        16U                                            /*!< APSR: GE Position */
+#define APSR_GE_Msk                        (0xFUL << APSR_GE_Pos)                         /*!< APSR: GE Mask */
+
+
+/**
+  \brief  Union type to access the Interrupt Program Status Register (IPSR).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t ISR:9;                      /*!< bit:  0.. 8  Exception number */
+    uint32_t _reserved0:23;              /*!< bit:  9..31  Reserved */
+  } b;                                   /*!< Structure used for bit  access */
+  uint32_t w;                            /*!< Type      used for word access */
+} IPSR_Type;
+
+/* IPSR Register Definitions */
+#define IPSR_ISR_Pos                        0U                                            /*!< IPSR: ISR Position */
+#define IPSR_ISR_Msk                       (0x1FFUL /*<< IPSR_ISR_Pos*/)                  /*!< IPSR: ISR Mask */
+
+
+/**
+  \brief  Union type to access the Special-Purpose Program Status Registers (xPSR).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t ISR:9;                      /*!< bit:  0.. 8  Exception number */
+    uint32_t _reserved0:1;               /*!< bit:      9  Reserved */
+    uint32_t ICI_IT_1:6;                 /*!< bit: 10..15  ICI/IT part 1 */
+    uint32_t GE:4;                       /*!< bit: 16..19  Greater than or Equal flags */
+    uint32_t _reserved1:4;               /*!< bit: 20..23  Reserved */
+    uint32_t T:1;                        /*!< bit:     24  Thumb bit */
+    uint32_t ICI_IT_2:2;                 /*!< bit: 25..26  ICI/IT part 2 */
+    uint32_t Q:1;                        /*!< bit:     27  Saturation condition flag */
+    uint32_t V:1;                        /*!< bit:     28  Overflow condition code flag */
+    uint32_t C:1;                        /*!< bit:     29  Carry condition code flag */
+    uint32_t Z:1;                        /*!< bit:     30  Zero condition code flag */
+    uint32_t N:1;                        /*!< bit:     31  Negative condition code flag */
+  } b;                                   /*!< Structure used for bit  access */
+  uint32_t w;                            /*!< Type      used for word access */
+} xPSR_Type;
+
+/* xPSR Register Definitions */
+#define xPSR_N_Pos                         31U                                            /*!< xPSR: N Position */
+#define xPSR_N_Msk                         (1UL << xPSR_N_Pos)                            /*!< xPSR: N Mask */
+
+#define xPSR_Z_Pos                         30U                                            /*!< xPSR: Z Position */
+#define xPSR_Z_Msk                         (1UL << xPSR_Z_Pos)                            /*!< xPSR: Z Mask */
+
+#define xPSR_C_Pos                         29U                                            /*!< xPSR: C Position */
+#define xPSR_C_Msk                         (1UL << xPSR_C_Pos)                            /*!< xPSR: C Mask */
+
+#define xPSR_V_Pos                         28U                                            /*!< xPSR: V Position */
+#define xPSR_V_Msk                         (1UL << xPSR_V_Pos)                            /*!< xPSR: V Mask */
+
+#define xPSR_Q_Pos                         27U                                            /*!< xPSR: Q Position */
+#define xPSR_Q_Msk                         (1UL << xPSR_Q_Pos)                            /*!< xPSR: Q Mask */
+
+#define xPSR_ICI_IT_2_Pos                  25U                                            /*!< xPSR: ICI/IT part 2 Position */
+#define xPSR_ICI_IT_2_Msk                  (3UL << xPSR_ICI_IT_2_Pos)                     /*!< xPSR: ICI/IT part 2 Mask */
+
+#define xPSR_T_Pos                         24U                                            /*!< xPSR: T Position */
+#define xPSR_T_Msk                         (1UL << xPSR_T_Pos)                            /*!< xPSR: T Mask */
+
+#define xPSR_GE_Pos                        16U                                            /*!< xPSR: GE Position */
+#define xPSR_GE_Msk                        (0xFUL << xPSR_GE_Pos)                         /*!< xPSR: GE Mask */
+
+#define xPSR_ICI_IT_1_Pos                  10U                                            /*!< xPSR: ICI/IT part 1 Position */
+#define xPSR_ICI_IT_1_Msk                  (0x3FUL << xPSR_ICI_IT_1_Pos)                  /*!< xPSR: ICI/IT part 1 Mask */
+
+#define xPSR_ISR_Pos                        0U                                            /*!< xPSR: ISR Position */
+#define xPSR_ISR_Msk                       (0x1FFUL /*<< xPSR_ISR_Pos*/)                  /*!< xPSR: ISR Mask */
+
+
+/**
+  \brief  Union type to access the Control Registers (CONTROL).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t nPRIV:1;                    /*!< bit:      0  Execution privilege in Thread mode */
+    uint32_t SPSEL:1;                    /*!< bit:      1  Stack to be used */
+    uint32_t FPCA:1;                     /*!< bit:      2  FP extension active flag */
+    uint32_t _reserved0:29;              /*!< bit:  3..31  Reserved */
+  } b;                                   /*!< Structure used for bit  access */
+  uint32_t w;                            /*!< Type      used for word access */
+} CONTROL_Type;
+
+/* CONTROL Register Definitions */
+#define CONTROL_FPCA_Pos                    2U                                            /*!< CONTROL: FPCA Position */
+#define CONTROL_FPCA_Msk                   (1UL << CONTROL_FPCA_Pos)                      /*!< CONTROL: FPCA Mask */
+
+#define CONTROL_SPSEL_Pos                   1U                                            /*!< CONTROL: SPSEL Position */
+#define CONTROL_SPSEL_Msk                  (1UL << CONTROL_SPSEL_Pos)                     /*!< CONTROL: SPSEL Mask */
+
+#define CONTROL_nPRIV_Pos                   0U                                            /*!< CONTROL: nPRIV Position */
+#define CONTROL_nPRIV_Msk                  (1UL /*<< CONTROL_nPRIV_Pos*/)                 /*!< CONTROL: nPRIV Mask */
+
+/*@} end of group CMSIS_CORE */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_NVIC  Nested Vectored Interrupt Controller (NVIC)
+  \brief      Type definitions for the NVIC Registers
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Nested Vectored Interrupt Controller (NVIC).
+ */
+typedef struct
+{
+  __IOM uint32_t ISER[8U];               /*!< Offset: 0x000 (R/W)  Interrupt Set Enable Register */
+        uint32_t RESERVED0[24U];
+  __IOM uint32_t ICER[8U];               /*!< Offset: 0x080 (R/W)  Interrupt Clear Enable Register */
+        uint32_t RESERVED1[24U];
+  __IOM uint32_t ISPR[8U];               /*!< Offset: 0x100 (R/W)  Interrupt Set Pending Register */
+        uint32_t RESERVED2[24U];
+  __IOM uint32_t ICPR[8U];               /*!< Offset: 0x180 (R/W)  Interrupt Clear Pending Register */
+        uint32_t RESERVED3[24U];
+  __IOM uint32_t IABR[8U];               /*!< Offset: 0x200 (R/W)  Interrupt Active bit Register */
+        uint32_t RESERVED4[56U];
+  __IOM uint8_t  IP[240U];               /*!< Offset: 0x300 (R/W)  Interrupt Priority Register (8Bit wide) */
+        uint32_t RESERVED5[644U];
+  __OM  uint32_t STIR;                   /*!< Offset: 0xE00 ( /W)  Software Trigger Interrupt Register */
+}  NVIC_Type;
+
+/* Software Triggered Interrupt Register Definitions */
+#define NVIC_STIR_INTID_Pos                 0U                                         /*!< STIR: INTLINESNUM Position */
+#define NVIC_STIR_INTID_Msk                (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/)        /*!< STIR: INTLINESNUM Mask */
+
+/*@} end of group CMSIS_NVIC */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_SCB     System Control Block (SCB)
+  \brief    Type definitions for the System Control Block Registers
+  @{
+ */
+
+/**
+  \brief  Structure type to access the System Control Block (SCB).
+ */
+typedef struct
+{
+  __IM  uint32_t CPUID;                  /*!< Offset: 0x000 (R/ )  CPUID Base Register */
+  __IOM uint32_t ICSR;                   /*!< Offset: 0x004 (R/W)  Interrupt Control and State Register */
+  __IOM uint32_t VTOR;                   /*!< Offset: 0x008 (R/W)  Vector Table Offset Register */
+  __IOM uint32_t AIRCR;                  /*!< Offset: 0x00C (R/W)  Application Interrupt and Reset Control Register */
+  __IOM uint32_t SCR;                    /*!< Offset: 0x010 (R/W)  System Control Register */
+  __IOM uint32_t CCR;                    /*!< Offset: 0x014 (R/W)  Configuration Control Register */
+  __IOM uint8_t  SHP[12U];               /*!< Offset: 0x018 (R/W)  System Handlers Priority Registers (4-7, 8-11, 12-15) */
+  __IOM uint32_t SHCSR;                  /*!< Offset: 0x024 (R/W)  System Handler Control and State Register */
+  __IOM uint32_t CFSR;                   /*!< Offset: 0x028 (R/W)  Configurable Fault Status Register */
+  __IOM uint32_t HFSR;                   /*!< Offset: 0x02C (R/W)  HardFault Status Register */
+  __IOM uint32_t DFSR;                   /*!< Offset: 0x030 (R/W)  Debug Fault Status Register */
+  __IOM uint32_t MMFAR;                  /*!< Offset: 0x034 (R/W)  MemManage Fault Address Register */
+  __IOM uint32_t BFAR;                   /*!< Offset: 0x038 (R/W)  BusFault Address Register */
+  __IOM uint32_t AFSR;                   /*!< Offset: 0x03C (R/W)  Auxiliary Fault Status Register */
+  __IM  uint32_t PFR[2U];                /*!< Offset: 0x040 (R/ )  Processor Feature Register */
+  __IM  uint32_t DFR;                    /*!< Offset: 0x048 (R/ )  Debug Feature Register */
+  __IM  uint32_t ADR;                    /*!< Offset: 0x04C (R/ )  Auxiliary Feature Register */
+  __IM  uint32_t MMFR[4U];               /*!< Offset: 0x050 (R/ )  Memory Model Feature Register */
+  __IM  uint32_t ISAR[5U];               /*!< Offset: 0x060 (R/ )  Instruction Set Attributes Register */
+        uint32_t RESERVED0[5U];
+  __IOM uint32_t CPACR;                  /*!< Offset: 0x088 (R/W)  Coprocessor Access Control Register */
+} SCB_Type;
+
+/* SCB CPUID Register Definitions */
+#define SCB_CPUID_IMPLEMENTER_Pos          24U                                            /*!< SCB CPUID: IMPLEMENTER Position */
+#define SCB_CPUID_IMPLEMENTER_Msk          (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos)          /*!< SCB CPUID: IMPLEMENTER Mask */
+
+#define SCB_CPUID_VARIANT_Pos              20U                                            /*!< SCB CPUID: VARIANT Position */
+#define SCB_CPUID_VARIANT_Msk              (0xFUL << SCB_CPUID_VARIANT_Pos)               /*!< SCB CPUID: VARIANT Mask */
+
+#define SCB_CPUID_ARCHITECTURE_Pos         16U                                            /*!< SCB CPUID: ARCHITECTURE Position */
+#define SCB_CPUID_ARCHITECTURE_Msk         (0xFUL << SCB_CPUID_ARCHITECTURE_Pos)          /*!< SCB CPUID: ARCHITECTURE Mask */
+
+#define SCB_CPUID_PARTNO_Pos                4U                                            /*!< SCB CPUID: PARTNO Position */
+#define SCB_CPUID_PARTNO_Msk               (0xFFFUL << SCB_CPUID_PARTNO_Pos)              /*!< SCB CPUID: PARTNO Mask */
+
+#define SCB_CPUID_REVISION_Pos              0U                                            /*!< SCB CPUID: REVISION Position */
+#define SCB_CPUID_REVISION_Msk             (0xFUL /*<< SCB_CPUID_REVISION_Pos*/)          /*!< SCB CPUID: REVISION Mask */
+
+/* SCB Interrupt Control State Register Definitions */
+#define SCB_ICSR_NMIPENDSET_Pos            31U                                            /*!< SCB ICSR: NMIPENDSET Position */
+#define SCB_ICSR_NMIPENDSET_Msk            (1UL << SCB_ICSR_NMIPENDSET_Pos)               /*!< SCB ICSR: NMIPENDSET Mask */
+
+#define SCB_ICSR_PENDSVSET_Pos             28U                                            /*!< SCB ICSR: PENDSVSET Position */
+#define SCB_ICSR_PENDSVSET_Msk             (1UL << SCB_ICSR_PENDSVSET_Pos)                /*!< SCB ICSR: PENDSVSET Mask */
+
+#define SCB_ICSR_PENDSVCLR_Pos             27U                                            /*!< SCB ICSR: PENDSVCLR Position */
+#define SCB_ICSR_PENDSVCLR_Msk             (1UL << SCB_ICSR_PENDSVCLR_Pos)                /*!< SCB ICSR: PENDSVCLR Mask */
+
+#define SCB_ICSR_PENDSTSET_Pos             26U                                            /*!< SCB ICSR: PENDSTSET Position */
+#define SCB_ICSR_PENDSTSET_Msk             (1UL << SCB_ICSR_PENDSTSET_Pos)                /*!< SCB ICSR: PENDSTSET Mask */
+
+#define SCB_ICSR_PENDSTCLR_Pos             25U                                            /*!< SCB ICSR: PENDSTCLR Position */
+#define SCB_ICSR_PENDSTCLR_Msk             (1UL << SCB_ICSR_PENDSTCLR_Pos)                /*!< SCB ICSR: PENDSTCLR Mask */
+
+#define SCB_ICSR_ISRPREEMPT_Pos            23U                                            /*!< SCB ICSR: ISRPREEMPT Position */
+#define SCB_ICSR_ISRPREEMPT_Msk            (1UL << SCB_ICSR_ISRPREEMPT_Pos)               /*!< SCB ICSR: ISRPREEMPT Mask */
+
+#define SCB_ICSR_ISRPENDING_Pos            22U                                            /*!< SCB ICSR: ISRPENDING Position */
+#define SCB_ICSR_ISRPENDING_Msk            (1UL << SCB_ICSR_ISRPENDING_Pos)               /*!< SCB ICSR: ISRPENDING Mask */
+
+#define SCB_ICSR_VECTPENDING_Pos           12U                                            /*!< SCB ICSR: VECTPENDING Position */
+#define SCB_ICSR_VECTPENDING_Msk           (0x1FFUL << SCB_ICSR_VECTPENDING_Pos)          /*!< SCB ICSR: VECTPENDING Mask */
+
+#define SCB_ICSR_RETTOBASE_Pos             11U                                            /*!< SCB ICSR: RETTOBASE Position */
+#define SCB_ICSR_RETTOBASE_Msk             (1UL << SCB_ICSR_RETTOBASE_Pos)                /*!< SCB ICSR: RETTOBASE Mask */
+
+#define SCB_ICSR_VECTACTIVE_Pos             0U                                            /*!< SCB ICSR: VECTACTIVE Position */
+#define SCB_ICSR_VECTACTIVE_Msk            (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/)       /*!< SCB ICSR: VECTACTIVE Mask */
+
+/* SCB Vector Table Offset Register Definitions */
+#define SCB_VTOR_TBLOFF_Pos                 7U                                            /*!< SCB VTOR: TBLOFF Position */
+#define SCB_VTOR_TBLOFF_Msk                (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos)           /*!< SCB VTOR: TBLOFF Mask */
+
+/* SCB Application Interrupt and Reset Control Register Definitions */
+#define SCB_AIRCR_VECTKEY_Pos              16U                                            /*!< SCB AIRCR: VECTKEY Position */
+#define SCB_AIRCR_VECTKEY_Msk              (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos)            /*!< SCB AIRCR: VECTKEY Mask */
+
+#define SCB_AIRCR_VECTKEYSTAT_Pos          16U                                            /*!< SCB AIRCR: VECTKEYSTAT Position */
+#define SCB_AIRCR_VECTKEYSTAT_Msk          (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos)        /*!< SCB AIRCR: VECTKEYSTAT Mask */
+
+#define SCB_AIRCR_ENDIANESS_Pos            15U                                            /*!< SCB AIRCR: ENDIANESS Position */
+#define SCB_AIRCR_ENDIANESS_Msk            (1UL << SCB_AIRCR_ENDIANESS_Pos)               /*!< SCB AIRCR: ENDIANESS Mask */
+
+#define SCB_AIRCR_PRIGROUP_Pos              8U                                            /*!< SCB AIRCR: PRIGROUP Position */
+#define SCB_AIRCR_PRIGROUP_Msk             (7UL << SCB_AIRCR_PRIGROUP_Pos)                /*!< SCB AIRCR: PRIGROUP Mask */
+
+#define SCB_AIRCR_SYSRESETREQ_Pos           2U                                            /*!< SCB AIRCR: SYSRESETREQ Position */
+#define SCB_AIRCR_SYSRESETREQ_Msk          (1UL << SCB_AIRCR_SYSRESETREQ_Pos)             /*!< SCB AIRCR: SYSRESETREQ Mask */
+
+#define SCB_AIRCR_VECTCLRACTIVE_Pos         1U                                            /*!< SCB AIRCR: VECTCLRACTIVE Position */
+#define SCB_AIRCR_VECTCLRACTIVE_Msk        (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos)           /*!< SCB AIRCR: VECTCLRACTIVE Mask */
+
+#define SCB_AIRCR_VECTRESET_Pos             0U                                            /*!< SCB AIRCR: VECTRESET Position */
+#define SCB_AIRCR_VECTRESET_Msk            (1UL /*<< SCB_AIRCR_VECTRESET_Pos*/)           /*!< SCB AIRCR: VECTRESET Mask */
+
+/* SCB System Control Register Definitions */
+#define SCB_SCR_SEVONPEND_Pos               4U                                            /*!< SCB SCR: SEVONPEND Position */
+#define SCB_SCR_SEVONPEND_Msk              (1UL << SCB_SCR_SEVONPEND_Pos)                 /*!< SCB SCR: SEVONPEND Mask */
+
+#define SCB_SCR_SLEEPDEEP_Pos               2U                                            /*!< SCB SCR: SLEEPDEEP Position */
+#define SCB_SCR_SLEEPDEEP_Msk              (1UL << SCB_SCR_SLEEPDEEP_Pos)                 /*!< SCB SCR: SLEEPDEEP Mask */
+
+#define SCB_SCR_SLEEPONEXIT_Pos             1U                                            /*!< SCB SCR: SLEEPONEXIT Position */
+#define SCB_SCR_SLEEPONEXIT_Msk            (1UL << SCB_SCR_SLEEPONEXIT_Pos)               /*!< SCB SCR: SLEEPONEXIT Mask */
+
+/* SCB Configuration Control Register Definitions */
+#define SCB_CCR_STKALIGN_Pos                9U                                            /*!< SCB CCR: STKALIGN Position */
+#define SCB_CCR_STKALIGN_Msk               (1UL << SCB_CCR_STKALIGN_Pos)                  /*!< SCB CCR: STKALIGN Mask */
+
+#define SCB_CCR_BFHFNMIGN_Pos               8U                                            /*!< SCB CCR: BFHFNMIGN Position */
+#define SCB_CCR_BFHFNMIGN_Msk              (1UL << SCB_CCR_BFHFNMIGN_Pos)                 /*!< SCB CCR: BFHFNMIGN Mask */
+
+#define SCB_CCR_DIV_0_TRP_Pos               4U                                            /*!< SCB CCR: DIV_0_TRP Position */
+#define SCB_CCR_DIV_0_TRP_Msk              (1UL << SCB_CCR_DIV_0_TRP_Pos)                 /*!< SCB CCR: DIV_0_TRP Mask */
+
+#define SCB_CCR_UNALIGN_TRP_Pos             3U                                            /*!< SCB CCR: UNALIGN_TRP Position */
+#define SCB_CCR_UNALIGN_TRP_Msk            (1UL << SCB_CCR_UNALIGN_TRP_Pos)               /*!< SCB CCR: UNALIGN_TRP Mask */
+
+#define SCB_CCR_USERSETMPEND_Pos            1U                                            /*!< SCB CCR: USERSETMPEND Position */
+#define SCB_CCR_USERSETMPEND_Msk           (1UL << SCB_CCR_USERSETMPEND_Pos)              /*!< SCB CCR: USERSETMPEND Mask */
+
+#define SCB_CCR_NONBASETHRDENA_Pos          0U                                            /*!< SCB CCR: NONBASETHRDENA Position */
+#define SCB_CCR_NONBASETHRDENA_Msk         (1UL /*<< SCB_CCR_NONBASETHRDENA_Pos*/)        /*!< SCB CCR: NONBASETHRDENA Mask */
+
+/* SCB System Handler Control and State Register Definitions */
+#define SCB_SHCSR_USGFAULTENA_Pos          18U                                            /*!< SCB SHCSR: USGFAULTENA Position */
+#define SCB_SHCSR_USGFAULTENA_Msk          (1UL << SCB_SHCSR_USGFAULTENA_Pos)             /*!< SCB SHCSR: USGFAULTENA Mask */
+
+#define SCB_SHCSR_BUSFAULTENA_Pos          17U                                            /*!< SCB SHCSR: BUSFAULTENA Position */
+#define SCB_SHCSR_BUSFAULTENA_Msk          (1UL << SCB_SHCSR_BUSFAULTENA_Pos)             /*!< SCB SHCSR: BUSFAULTENA Mask */
+
+#define SCB_SHCSR_MEMFAULTENA_Pos          16U                                            /*!< SCB SHCSR: MEMFAULTENA Position */
+#define SCB_SHCSR_MEMFAULTENA_Msk          (1UL << SCB_SHCSR_MEMFAULTENA_Pos)             /*!< SCB SHCSR: MEMFAULTENA Mask */
+
+#define SCB_SHCSR_SVCALLPENDED_Pos         15U                                            /*!< SCB SHCSR: SVCALLPENDED Position */
+#define SCB_SHCSR_SVCALLPENDED_Msk         (1UL << SCB_SHCSR_SVCALLPENDED_Pos)            /*!< SCB SHCSR: SVCALLPENDED Mask */
+
+#define SCB_SHCSR_BUSFAULTPENDED_Pos       14U                                            /*!< SCB SHCSR: BUSFAULTPENDED Position */
+#define SCB_SHCSR_BUSFAULTPENDED_Msk       (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos)          /*!< SCB SHCSR: BUSFAULTPENDED Mask */
+
+#define SCB_SHCSR_MEMFAULTPENDED_Pos       13U                                            /*!< SCB SHCSR: MEMFAULTPENDED Position */
+#define SCB_SHCSR_MEMFAULTPENDED_Msk       (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos)          /*!< SCB SHCSR: MEMFAULTPENDED Mask */
+
+#define SCB_SHCSR_USGFAULTPENDED_Pos       12U                                            /*!< SCB SHCSR: USGFAULTPENDED Position */
+#define SCB_SHCSR_USGFAULTPENDED_Msk       (1UL << SCB_SHCSR_USGFAULTPENDED_Pos)          /*!< SCB SHCSR: USGFAULTPENDED Mask */
+
+#define SCB_SHCSR_SYSTICKACT_Pos           11U                                            /*!< SCB SHCSR: SYSTICKACT Position */
+#define SCB_SHCSR_SYSTICKACT_Msk           (1UL << SCB_SHCSR_SYSTICKACT_Pos)              /*!< SCB SHCSR: SYSTICKACT Mask */
+
+#define SCB_SHCSR_PENDSVACT_Pos            10U                                            /*!< SCB SHCSR: PENDSVACT Position */
+#define SCB_SHCSR_PENDSVACT_Msk            (1UL << SCB_SHCSR_PENDSVACT_Pos)               /*!< SCB SHCSR: PENDSVACT Mask */
+
+#define SCB_SHCSR_MONITORACT_Pos            8U                                            /*!< SCB SHCSR: MONITORACT Position */
+#define SCB_SHCSR_MONITORACT_Msk           (1UL << SCB_SHCSR_MONITORACT_Pos)              /*!< SCB SHCSR: MONITORACT Mask */
+
+#define SCB_SHCSR_SVCALLACT_Pos             7U                                            /*!< SCB SHCSR: SVCALLACT Position */
+#define SCB_SHCSR_SVCALLACT_Msk            (1UL << SCB_SHCSR_SVCALLACT_Pos)               /*!< SCB SHCSR: SVCALLACT Mask */
+
+#define SCB_SHCSR_USGFAULTACT_Pos           3U                                            /*!< SCB SHCSR: USGFAULTACT Position */
+#define SCB_SHCSR_USGFAULTACT_Msk          (1UL << SCB_SHCSR_USGFAULTACT_Pos)             /*!< SCB SHCSR: USGFAULTACT Mask */
+
+#define SCB_SHCSR_BUSFAULTACT_Pos           1U                                            /*!< SCB SHCSR: BUSFAULTACT Position */
+#define SCB_SHCSR_BUSFAULTACT_Msk          (1UL << SCB_SHCSR_BUSFAULTACT_Pos)             /*!< SCB SHCSR: BUSFAULTACT Mask */
+
+#define SCB_SHCSR_MEMFAULTACT_Pos           0U                                            /*!< SCB SHCSR: MEMFAULTACT Position */
+#define SCB_SHCSR_MEMFAULTACT_Msk          (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/)         /*!< SCB SHCSR: MEMFAULTACT Mask */
+
+/* SCB Configurable Fault Status Register Definitions */
+#define SCB_CFSR_USGFAULTSR_Pos            16U                                            /*!< SCB CFSR: Usage Fault Status Register Position */
+#define SCB_CFSR_USGFAULTSR_Msk            (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos)          /*!< SCB CFSR: Usage Fault Status Register Mask */
+
+#define SCB_CFSR_BUSFAULTSR_Pos             8U                                            /*!< SCB CFSR: Bus Fault Status Register Position */
+#define SCB_CFSR_BUSFAULTSR_Msk            (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos)            /*!< SCB CFSR: Bus Fault Status Register Mask */
+
+#define SCB_CFSR_MEMFAULTSR_Pos             0U                                            /*!< SCB CFSR: Memory Manage Fault Status Register Position */
+#define SCB_CFSR_MEMFAULTSR_Msk            (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/)        /*!< SCB CFSR: Memory Manage Fault Status Register Mask */
+
+/* MemManage Fault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_MMARVALID_Pos             (SCB_SHCSR_MEMFAULTACT_Pos + 7U)               /*!< SCB CFSR (MMFSR): MMARVALID Position */
+#define SCB_CFSR_MMARVALID_Msk             (1UL << SCB_CFSR_MMARVALID_Pos)                /*!< SCB CFSR (MMFSR): MMARVALID Mask */
+
+#define SCB_CFSR_MLSPERR_Pos               (SCB_SHCSR_MEMFAULTACT_Pos + 5U)               /*!< SCB CFSR (MMFSR): MLSPERR Position */
+#define SCB_CFSR_MLSPERR_Msk               (1UL << SCB_CFSR_MLSPERR_Pos)                  /*!< SCB CFSR (MMFSR): MLSPERR Mask */
+
+#define SCB_CFSR_MSTKERR_Pos               (SCB_SHCSR_MEMFAULTACT_Pos + 4U)               /*!< SCB CFSR (MMFSR): MSTKERR Position */
+#define SCB_CFSR_MSTKERR_Msk               (1UL << SCB_CFSR_MSTKERR_Pos)                  /*!< SCB CFSR (MMFSR): MSTKERR Mask */
+
+#define SCB_CFSR_MUNSTKERR_Pos             (SCB_SHCSR_MEMFAULTACT_Pos + 3U)               /*!< SCB CFSR (MMFSR): MUNSTKERR Position */
+#define SCB_CFSR_MUNSTKERR_Msk             (1UL << SCB_CFSR_MUNSTKERR_Pos)                /*!< SCB CFSR (MMFSR): MUNSTKERR Mask */
+
+#define SCB_CFSR_DACCVIOL_Pos              (SCB_SHCSR_MEMFAULTACT_Pos + 1U)               /*!< SCB CFSR (MMFSR): DACCVIOL Position */
+#define SCB_CFSR_DACCVIOL_Msk              (1UL << SCB_CFSR_DACCVIOL_Pos)                 /*!< SCB CFSR (MMFSR): DACCVIOL Mask */
+
+#define SCB_CFSR_IACCVIOL_Pos              (SCB_SHCSR_MEMFAULTACT_Pos + 0U)               /*!< SCB CFSR (MMFSR): IACCVIOL Position */
+#define SCB_CFSR_IACCVIOL_Msk              (1UL /*<< SCB_CFSR_IACCVIOL_Pos*/)             /*!< SCB CFSR (MMFSR): IACCVIOL Mask */
+
+/* BusFault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_BFARVALID_Pos            (SCB_CFSR_BUSFAULTSR_Pos + 7U)                  /*!< SCB CFSR (BFSR): BFARVALID Position */
+#define SCB_CFSR_BFARVALID_Msk            (1UL << SCB_CFSR_BFARVALID_Pos)                 /*!< SCB CFSR (BFSR): BFARVALID Mask */
+
+#define SCB_CFSR_LSPERR_Pos               (SCB_CFSR_BUSFAULTSR_Pos + 5U)                  /*!< SCB CFSR (BFSR): LSPERR Position */
+#define SCB_CFSR_LSPERR_Msk               (1UL << SCB_CFSR_LSPERR_Pos)                    /*!< SCB CFSR (BFSR): LSPERR Mask */
+
+#define SCB_CFSR_STKERR_Pos               (SCB_CFSR_BUSFAULTSR_Pos + 4U)                  /*!< SCB CFSR (BFSR): STKERR Position */
+#define SCB_CFSR_STKERR_Msk               (1UL << SCB_CFSR_STKERR_Pos)                    /*!< SCB CFSR (BFSR): STKERR Mask */
+
+#define SCB_CFSR_UNSTKERR_Pos             (SCB_CFSR_BUSFAULTSR_Pos + 3U)                  /*!< SCB CFSR (BFSR): UNSTKERR Position */
+#define SCB_CFSR_UNSTKERR_Msk             (1UL << SCB_CFSR_UNSTKERR_Pos)                  /*!< SCB CFSR (BFSR): UNSTKERR Mask */
+
+#define SCB_CFSR_IMPRECISERR_Pos          (SCB_CFSR_BUSFAULTSR_Pos + 2U)                  /*!< SCB CFSR (BFSR): IMPRECISERR Position */
+#define SCB_CFSR_IMPRECISERR_Msk          (1UL << SCB_CFSR_IMPRECISERR_Pos)               /*!< SCB CFSR (BFSR): IMPRECISERR Mask */
+
+#define SCB_CFSR_PRECISERR_Pos            (SCB_CFSR_BUSFAULTSR_Pos + 1U)                  /*!< SCB CFSR (BFSR): PRECISERR Position */
+#define SCB_CFSR_PRECISERR_Msk            (1UL << SCB_CFSR_PRECISERR_Pos)                 /*!< SCB CFSR (BFSR): PRECISERR Mask */
+
+#define SCB_CFSR_IBUSERR_Pos              (SCB_CFSR_BUSFAULTSR_Pos + 0U)                  /*!< SCB CFSR (BFSR): IBUSERR Position */
+#define SCB_CFSR_IBUSERR_Msk              (1UL << SCB_CFSR_IBUSERR_Pos)                   /*!< SCB CFSR (BFSR): IBUSERR Mask */
+
+/* UsageFault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_DIVBYZERO_Pos            (SCB_CFSR_USGFAULTSR_Pos + 9U)                  /*!< SCB CFSR (UFSR): DIVBYZERO Position */
+#define SCB_CFSR_DIVBYZERO_Msk            (1UL << SCB_CFSR_DIVBYZERO_Pos)                 /*!< SCB CFSR (UFSR): DIVBYZERO Mask */
+
+#define SCB_CFSR_UNALIGNED_Pos            (SCB_CFSR_USGFAULTSR_Pos + 8U)                  /*!< SCB CFSR (UFSR): UNALIGNED Position */
+#define SCB_CFSR_UNALIGNED_Msk            (1UL << SCB_CFSR_UNALIGNED_Pos)                 /*!< SCB CFSR (UFSR): UNALIGNED Mask */
+
+#define SCB_CFSR_NOCP_Pos                 (SCB_CFSR_USGFAULTSR_Pos + 3U)                  /*!< SCB CFSR (UFSR): NOCP Position */
+#define SCB_CFSR_NOCP_Msk                 (1UL << SCB_CFSR_NOCP_Pos)                      /*!< SCB CFSR (UFSR): NOCP Mask */
+
+#define SCB_CFSR_INVPC_Pos                (SCB_CFSR_USGFAULTSR_Pos + 2U)                  /*!< SCB CFSR (UFSR): INVPC Position */
+#define SCB_CFSR_INVPC_Msk                (1UL << SCB_CFSR_INVPC_Pos)                     /*!< SCB CFSR (UFSR): INVPC Mask */
+
+#define SCB_CFSR_INVSTATE_Pos             (SCB_CFSR_USGFAULTSR_Pos + 1U)                  /*!< SCB CFSR (UFSR): INVSTATE Position */
+#define SCB_CFSR_INVSTATE_Msk             (1UL << SCB_CFSR_INVSTATE_Pos)                  /*!< SCB CFSR (UFSR): INVSTATE Mask */
+
+#define SCB_CFSR_UNDEFINSTR_Pos           (SCB_CFSR_USGFAULTSR_Pos + 0U)                  /*!< SCB CFSR (UFSR): UNDEFINSTR Position */
+#define SCB_CFSR_UNDEFINSTR_Msk           (1UL << SCB_CFSR_UNDEFINSTR_Pos)                /*!< SCB CFSR (UFSR): UNDEFINSTR Mask */
+
+/* SCB Hard Fault Status Register Definitions */
+#define SCB_HFSR_DEBUGEVT_Pos              31U                                            /*!< SCB HFSR: DEBUGEVT Position */
+#define SCB_HFSR_DEBUGEVT_Msk              (1UL << SCB_HFSR_DEBUGEVT_Pos)                 /*!< SCB HFSR: DEBUGEVT Mask */
+
+#define SCB_HFSR_FORCED_Pos                30U                                            /*!< SCB HFSR: FORCED Position */
+#define SCB_HFSR_FORCED_Msk                (1UL << SCB_HFSR_FORCED_Pos)                   /*!< SCB HFSR: FORCED Mask */
+
+#define SCB_HFSR_VECTTBL_Pos                1U                                            /*!< SCB HFSR: VECTTBL Position */
+#define SCB_HFSR_VECTTBL_Msk               (1UL << SCB_HFSR_VECTTBL_Pos)                  /*!< SCB HFSR: VECTTBL Mask */
+
+/* SCB Debug Fault Status Register Definitions */
+#define SCB_DFSR_EXTERNAL_Pos               4U                                            /*!< SCB DFSR: EXTERNAL Position */
+#define SCB_DFSR_EXTERNAL_Msk              (1UL << SCB_DFSR_EXTERNAL_Pos)                 /*!< SCB DFSR: EXTERNAL Mask */
+
+#define SCB_DFSR_VCATCH_Pos                 3U                                            /*!< SCB DFSR: VCATCH Position */
+#define SCB_DFSR_VCATCH_Msk                (1UL << SCB_DFSR_VCATCH_Pos)                   /*!< SCB DFSR: VCATCH Mask */
+
+#define SCB_DFSR_DWTTRAP_Pos                2U                                            /*!< SCB DFSR: DWTTRAP Position */
+#define SCB_DFSR_DWTTRAP_Msk               (1UL << SCB_DFSR_DWTTRAP_Pos)                  /*!< SCB DFSR: DWTTRAP Mask */
+
+#define SCB_DFSR_BKPT_Pos                   1U                                            /*!< SCB DFSR: BKPT Position */
+#define SCB_DFSR_BKPT_Msk                  (1UL << SCB_DFSR_BKPT_Pos)                     /*!< SCB DFSR: BKPT Mask */
+
+#define SCB_DFSR_HALTED_Pos                 0U                                            /*!< SCB DFSR: HALTED Position */
+#define SCB_DFSR_HALTED_Msk                (1UL /*<< SCB_DFSR_HALTED_Pos*/)               /*!< SCB DFSR: HALTED Mask */
+
+/*@} end of group CMSIS_SCB */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB)
+  \brief    Type definitions for the System Control and ID Register not in the SCB
+  @{
+ */
+
+/**
+  \brief  Structure type to access the System Control and ID Register not in the SCB.
+ */
+typedef struct
+{
+        uint32_t RESERVED0[1U];
+  __IM  uint32_t ICTR;                   /*!< Offset: 0x004 (R/ )  Interrupt Controller Type Register */
+  __IOM uint32_t ACTLR;                  /*!< Offset: 0x008 (R/W)  Auxiliary Control Register */
+} SCnSCB_Type;
+
+/* Interrupt Controller Type Register Definitions */
+#define SCnSCB_ICTR_INTLINESNUM_Pos         0U                                         /*!< ICTR: INTLINESNUM Position */
+#define SCnSCB_ICTR_INTLINESNUM_Msk        (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/)  /*!< ICTR: INTLINESNUM Mask */
+
+/* Auxiliary Control Register Definitions */
+#define SCnSCB_ACTLR_DISOOFP_Pos            9U                                         /*!< ACTLR: DISOOFP Position */
+#define SCnSCB_ACTLR_DISOOFP_Msk           (1UL << SCnSCB_ACTLR_DISOOFP_Pos)           /*!< ACTLR: DISOOFP Mask */
+
+#define SCnSCB_ACTLR_DISFPCA_Pos            8U                                         /*!< ACTLR: DISFPCA Position */
+#define SCnSCB_ACTLR_DISFPCA_Msk           (1UL << SCnSCB_ACTLR_DISFPCA_Pos)           /*!< ACTLR: DISFPCA Mask */
+
+#define SCnSCB_ACTLR_DISFOLD_Pos            2U                                         /*!< ACTLR: DISFOLD Position */
+#define SCnSCB_ACTLR_DISFOLD_Msk           (1UL << SCnSCB_ACTLR_DISFOLD_Pos)           /*!< ACTLR: DISFOLD Mask */
+
+#define SCnSCB_ACTLR_DISDEFWBUF_Pos         1U                                         /*!< ACTLR: DISDEFWBUF Position */
+#define SCnSCB_ACTLR_DISDEFWBUF_Msk        (1UL << SCnSCB_ACTLR_DISDEFWBUF_Pos)        /*!< ACTLR: DISDEFWBUF Mask */
+
+#define SCnSCB_ACTLR_DISMCYCINT_Pos         0U                                         /*!< ACTLR: DISMCYCINT Position */
+#define SCnSCB_ACTLR_DISMCYCINT_Msk        (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/)    /*!< ACTLR: DISMCYCINT Mask */
+
+/*@} end of group CMSIS_SCnotSCB */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_SysTick     System Tick Timer (SysTick)
+  \brief    Type definitions for the System Timer Registers.
+  @{
+ */
+
+/**
+  \brief  Structure type to access the System Timer (SysTick).
+ */
+typedef struct
+{
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x000 (R/W)  SysTick Control and Status Register */
+  __IOM uint32_t LOAD;                   /*!< Offset: 0x004 (R/W)  SysTick Reload Value Register */
+  __IOM uint32_t VAL;                    /*!< Offset: 0x008 (R/W)  SysTick Current Value Register */
+  __IM  uint32_t CALIB;                  /*!< Offset: 0x00C (R/ )  SysTick Calibration Register */
+} SysTick_Type;
+
+/* SysTick Control / Status Register Definitions */
+#define SysTick_CTRL_COUNTFLAG_Pos         16U                                            /*!< SysTick CTRL: COUNTFLAG Position */
+#define SysTick_CTRL_COUNTFLAG_Msk         (1UL << SysTick_CTRL_COUNTFLAG_Pos)            /*!< SysTick CTRL: COUNTFLAG Mask */
+
+#define SysTick_CTRL_CLKSOURCE_Pos          2U                                            /*!< SysTick CTRL: CLKSOURCE Position */
+#define SysTick_CTRL_CLKSOURCE_Msk         (1UL << SysTick_CTRL_CLKSOURCE_Pos)            /*!< SysTick CTRL: CLKSOURCE Mask */
+
+#define SysTick_CTRL_TICKINT_Pos            1U                                            /*!< SysTick CTRL: TICKINT Position */
+#define SysTick_CTRL_TICKINT_Msk           (1UL << SysTick_CTRL_TICKINT_Pos)              /*!< SysTick CTRL: TICKINT Mask */
+
+#define SysTick_CTRL_ENABLE_Pos             0U                                            /*!< SysTick CTRL: ENABLE Position */
+#define SysTick_CTRL_ENABLE_Msk            (1UL /*<< SysTick_CTRL_ENABLE_Pos*/)           /*!< SysTick CTRL: ENABLE Mask */
+
+/* SysTick Reload Register Definitions */
+#define SysTick_LOAD_RELOAD_Pos             0U                                            /*!< SysTick LOAD: RELOAD Position */
+#define SysTick_LOAD_RELOAD_Msk            (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/)    /*!< SysTick LOAD: RELOAD Mask */
+
+/* SysTick Current Register Definitions */
+#define SysTick_VAL_CURRENT_Pos             0U                                            /*!< SysTick VAL: CURRENT Position */
+#define SysTick_VAL_CURRENT_Msk            (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/)    /*!< SysTick VAL: CURRENT Mask */
+
+/* SysTick Calibration Register Definitions */
+#define SysTick_CALIB_NOREF_Pos            31U                                            /*!< SysTick CALIB: NOREF Position */
+#define SysTick_CALIB_NOREF_Msk            (1UL << SysTick_CALIB_NOREF_Pos)               /*!< SysTick CALIB: NOREF Mask */
+
+#define SysTick_CALIB_SKEW_Pos             30U                                            /*!< SysTick CALIB: SKEW Position */
+#define SysTick_CALIB_SKEW_Msk             (1UL << SysTick_CALIB_SKEW_Pos)                /*!< SysTick CALIB: SKEW Mask */
+
+#define SysTick_CALIB_TENMS_Pos             0U                                            /*!< SysTick CALIB: TENMS Position */
+#define SysTick_CALIB_TENMS_Msk            (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/)    /*!< SysTick CALIB: TENMS Mask */
+
+/*@} end of group CMSIS_SysTick */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_ITM     Instrumentation Trace Macrocell (ITM)
+  \brief    Type definitions for the Instrumentation Trace Macrocell (ITM)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Instrumentation Trace Macrocell Register (ITM).
+ */
+typedef struct
+{
+  __OM  union
+  {
+    __OM  uint8_t    u8;                 /*!< Offset: 0x000 ( /W)  ITM Stimulus Port 8-bit */
+    __OM  uint16_t   u16;                /*!< Offset: 0x000 ( /W)  ITM Stimulus Port 16-bit */
+    __OM  uint32_t   u32;                /*!< Offset: 0x000 ( /W)  ITM Stimulus Port 32-bit */
+  }  PORT [32U];                         /*!< Offset: 0x000 ( /W)  ITM Stimulus Port Registers */
+        uint32_t RESERVED0[864U];
+  __IOM uint32_t TER;                    /*!< Offset: 0xE00 (R/W)  ITM Trace Enable Register */
+        uint32_t RESERVED1[15U];
+  __IOM uint32_t TPR;                    /*!< Offset: 0xE40 (R/W)  ITM Trace Privilege Register */
+        uint32_t RESERVED2[15U];
+  __IOM uint32_t TCR;                    /*!< Offset: 0xE80 (R/W)  ITM Trace Control Register */
+        uint32_t RESERVED3[32U];
+        uint32_t RESERVED4[43U];
+  __OM  uint32_t LAR;                    /*!< Offset: 0xFB0 ( /W)  ITM Lock Access Register */
+  __IM  uint32_t LSR;                    /*!< Offset: 0xFB4 (R/ )  ITM Lock Status Register */
+        uint32_t RESERVED5[6U];
+  __IM  uint32_t PID4;                   /*!< Offset: 0xFD0 (R/ )  ITM Peripheral Identification Register #4 */
+  __IM  uint32_t PID5;                   /*!< Offset: 0xFD4 (R/ )  ITM Peripheral Identification Register #5 */
+  __IM  uint32_t PID6;                   /*!< Offset: 0xFD8 (R/ )  ITM Peripheral Identification Register #6 */
+  __IM  uint32_t PID7;                   /*!< Offset: 0xFDC (R/ )  ITM Peripheral Identification Register #7 */
+  __IM  uint32_t PID0;                   /*!< Offset: 0xFE0 (R/ )  ITM Peripheral Identification Register #0 */
+  __IM  uint32_t PID1;                   /*!< Offset: 0xFE4 (R/ )  ITM Peripheral Identification Register #1 */
+  __IM  uint32_t PID2;                   /*!< Offset: 0xFE8 (R/ )  ITM Peripheral Identification Register #2 */
+  __IM  uint32_t PID3;                   /*!< Offset: 0xFEC (R/ )  ITM Peripheral Identification Register #3 */
+  __IM  uint32_t CID0;                   /*!< Offset: 0xFF0 (R/ )  ITM Component  Identification Register #0 */
+  __IM  uint32_t CID1;                   /*!< Offset: 0xFF4 (R/ )  ITM Component  Identification Register #1 */
+  __IM  uint32_t CID2;                   /*!< Offset: 0xFF8 (R/ )  ITM Component  Identification Register #2 */
+  __IM  uint32_t CID3;                   /*!< Offset: 0xFFC (R/ )  ITM Component  Identification Register #3 */
+} ITM_Type;
+
+/* ITM Trace Privilege Register Definitions */
+#define ITM_TPR_PRIVMASK_Pos                0U                                            /*!< ITM TPR: PRIVMASK Position */
+#define ITM_TPR_PRIVMASK_Msk               (0xFFFFFFFFUL /*<< ITM_TPR_PRIVMASK_Pos*/)     /*!< ITM TPR: PRIVMASK Mask */
+
+/* ITM Trace Control Register Definitions */
+#define ITM_TCR_BUSY_Pos                   23U                                            /*!< ITM TCR: BUSY Position */
+#define ITM_TCR_BUSY_Msk                   (1UL << ITM_TCR_BUSY_Pos)                      /*!< ITM TCR: BUSY Mask */
+
+#define ITM_TCR_TraceBusID_Pos             16U                                            /*!< ITM TCR: ATBID Position */
+#define ITM_TCR_TraceBusID_Msk             (0x7FUL << ITM_TCR_TraceBusID_Pos)             /*!< ITM TCR: ATBID Mask */
+
+#define ITM_TCR_GTSFREQ_Pos                10U                                            /*!< ITM TCR: Global timestamp frequency Position */
+#define ITM_TCR_GTSFREQ_Msk                (3UL << ITM_TCR_GTSFREQ_Pos)                   /*!< ITM TCR: Global timestamp frequency Mask */
+
+#define ITM_TCR_TSPrescale_Pos              8U                                            /*!< ITM TCR: TSPrescale Position */
+#define ITM_TCR_TSPrescale_Msk             (3UL << ITM_TCR_TSPrescale_Pos)                /*!< ITM TCR: TSPrescale Mask */
+
+#define ITM_TCR_SWOENA_Pos                  4U                                            /*!< ITM TCR: SWOENA Position */
+#define ITM_TCR_SWOENA_Msk                 (1UL << ITM_TCR_SWOENA_Pos)                    /*!< ITM TCR: SWOENA Mask */
+
+#define ITM_TCR_DWTENA_Pos                  3U                                            /*!< ITM TCR: DWTENA Position */
+#define ITM_TCR_DWTENA_Msk                 (1UL << ITM_TCR_DWTENA_Pos)                    /*!< ITM TCR: DWTENA Mask */
+
+#define ITM_TCR_SYNCENA_Pos                 2U                                            /*!< ITM TCR: SYNCENA Position */
+#define ITM_TCR_SYNCENA_Msk                (1UL << ITM_TCR_SYNCENA_Pos)                   /*!< ITM TCR: SYNCENA Mask */
+
+#define ITM_TCR_TSENA_Pos                   1U                                            /*!< ITM TCR: TSENA Position */
+#define ITM_TCR_TSENA_Msk                  (1UL << ITM_TCR_TSENA_Pos)                     /*!< ITM TCR: TSENA Mask */
+
+#define ITM_TCR_ITMENA_Pos                  0U                                            /*!< ITM TCR: ITM Enable bit Position */
+#define ITM_TCR_ITMENA_Msk                 (1UL /*<< ITM_TCR_ITMENA_Pos*/)                /*!< ITM TCR: ITM Enable bit Mask */
+
+/* ITM Lock Status Register Definitions */
+#define ITM_LSR_ByteAcc_Pos                 2U                                            /*!< ITM LSR: ByteAcc Position */
+#define ITM_LSR_ByteAcc_Msk                (1UL << ITM_LSR_ByteAcc_Pos)                   /*!< ITM LSR: ByteAcc Mask */
+
+#define ITM_LSR_Access_Pos                  1U                                            /*!< ITM LSR: Access Position */
+#define ITM_LSR_Access_Msk                 (1UL << ITM_LSR_Access_Pos)                    /*!< ITM LSR: Access Mask */
+
+#define ITM_LSR_Present_Pos                 0U                                            /*!< ITM LSR: Present Position */
+#define ITM_LSR_Present_Msk                (1UL /*<< ITM_LSR_Present_Pos*/)               /*!< ITM LSR: Present Mask */
+
+/*@}*/ /* end of group CMSIS_ITM */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_DWT     Data Watchpoint and Trace (DWT)
+  \brief    Type definitions for the Data Watchpoint and Trace (DWT)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Data Watchpoint and Trace Register (DWT).
+ */
+typedef struct
+{
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x000 (R/W)  Control Register */
+  __IOM uint32_t CYCCNT;                 /*!< Offset: 0x004 (R/W)  Cycle Count Register */
+  __IOM uint32_t CPICNT;                 /*!< Offset: 0x008 (R/W)  CPI Count Register */
+  __IOM uint32_t EXCCNT;                 /*!< Offset: 0x00C (R/W)  Exception Overhead Count Register */
+  __IOM uint32_t SLEEPCNT;               /*!< Offset: 0x010 (R/W)  Sleep Count Register */
+  __IOM uint32_t LSUCNT;                 /*!< Offset: 0x014 (R/W)  LSU Count Register */
+  __IOM uint32_t FOLDCNT;                /*!< Offset: 0x018 (R/W)  Folded-instruction Count Register */
+  __IM  uint32_t PCSR;                   /*!< Offset: 0x01C (R/ )  Program Counter Sample Register */
+  __IOM uint32_t COMP0;                  /*!< Offset: 0x020 (R/W)  Comparator Register 0 */
+  __IOM uint32_t MASK0;                  /*!< Offset: 0x024 (R/W)  Mask Register 0 */
+  __IOM uint32_t FUNCTION0;              /*!< Offset: 0x028 (R/W)  Function Register 0 */
+        uint32_t RESERVED0[1U];
+  __IOM uint32_t COMP1;                  /*!< Offset: 0x030 (R/W)  Comparator Register 1 */
+  __IOM uint32_t MASK1;                  /*!< Offset: 0x034 (R/W)  Mask Register 1 */
+  __IOM uint32_t FUNCTION1;              /*!< Offset: 0x038 (R/W)  Function Register 1 */
+        uint32_t RESERVED1[1U];
+  __IOM uint32_t COMP2;                  /*!< Offset: 0x040 (R/W)  Comparator Register 2 */
+  __IOM uint32_t MASK2;                  /*!< Offset: 0x044 (R/W)  Mask Register 2 */
+  __IOM uint32_t FUNCTION2;              /*!< Offset: 0x048 (R/W)  Function Register 2 */
+        uint32_t RESERVED2[1U];
+  __IOM uint32_t COMP3;                  /*!< Offset: 0x050 (R/W)  Comparator Register 3 */
+  __IOM uint32_t MASK3;                  /*!< Offset: 0x054 (R/W)  Mask Register 3 */
+  __IOM uint32_t FUNCTION3;              /*!< Offset: 0x058 (R/W)  Function Register 3 */
+} DWT_Type;
+
+/* DWT Control Register Definitions */
+#define DWT_CTRL_NUMCOMP_Pos               28U                                         /*!< DWT CTRL: NUMCOMP Position */
+#define DWT_CTRL_NUMCOMP_Msk               (0xFUL << DWT_CTRL_NUMCOMP_Pos)             /*!< DWT CTRL: NUMCOMP Mask */
+
+#define DWT_CTRL_NOTRCPKT_Pos              27U                                         /*!< DWT CTRL: NOTRCPKT Position */
+#define DWT_CTRL_NOTRCPKT_Msk              (0x1UL << DWT_CTRL_NOTRCPKT_Pos)            /*!< DWT CTRL: NOTRCPKT Mask */
+
+#define DWT_CTRL_NOEXTTRIG_Pos             26U                                         /*!< DWT CTRL: NOEXTTRIG Position */
+#define DWT_CTRL_NOEXTTRIG_Msk             (0x1UL << DWT_CTRL_NOEXTTRIG_Pos)           /*!< DWT CTRL: NOEXTTRIG Mask */
+
+#define DWT_CTRL_NOCYCCNT_Pos              25U                                         /*!< DWT CTRL: NOCYCCNT Position */
+#define DWT_CTRL_NOCYCCNT_Msk              (0x1UL << DWT_CTRL_NOCYCCNT_Pos)            /*!< DWT CTRL: NOCYCCNT Mask */
+
+#define DWT_CTRL_NOPRFCNT_Pos              24U                                         /*!< DWT CTRL: NOPRFCNT Position */
+#define DWT_CTRL_NOPRFCNT_Msk              (0x1UL << DWT_CTRL_NOPRFCNT_Pos)            /*!< DWT CTRL: NOPRFCNT Mask */
+
+#define DWT_CTRL_CYCEVTENA_Pos             22U                                         /*!< DWT CTRL: CYCEVTENA Position */
+#define DWT_CTRL_CYCEVTENA_Msk             (0x1UL << DWT_CTRL_CYCEVTENA_Pos)           /*!< DWT CTRL: CYCEVTENA Mask */
+
+#define DWT_CTRL_FOLDEVTENA_Pos            21U                                         /*!< DWT CTRL: FOLDEVTENA Position */
+#define DWT_CTRL_FOLDEVTENA_Msk            (0x1UL << DWT_CTRL_FOLDEVTENA_Pos)          /*!< DWT CTRL: FOLDEVTENA Mask */
+
+#define DWT_CTRL_LSUEVTENA_Pos             20U                                         /*!< DWT CTRL: LSUEVTENA Position */
+#define DWT_CTRL_LSUEVTENA_Msk             (0x1UL << DWT_CTRL_LSUEVTENA_Pos)           /*!< DWT CTRL: LSUEVTENA Mask */
+
+#define DWT_CTRL_SLEEPEVTENA_Pos           19U                                         /*!< DWT CTRL: SLEEPEVTENA Position */
+#define DWT_CTRL_SLEEPEVTENA_Msk           (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos)         /*!< DWT CTRL: SLEEPEVTENA Mask */
+
+#define DWT_CTRL_EXCEVTENA_Pos             18U                                         /*!< DWT CTRL: EXCEVTENA Position */
+#define DWT_CTRL_EXCEVTENA_Msk             (0x1UL << DWT_CTRL_EXCEVTENA_Pos)           /*!< DWT CTRL: EXCEVTENA Mask */
+
+#define DWT_CTRL_CPIEVTENA_Pos             17U                                         /*!< DWT CTRL: CPIEVTENA Position */
+#define DWT_CTRL_CPIEVTENA_Msk             (0x1UL << DWT_CTRL_CPIEVTENA_Pos)           /*!< DWT CTRL: CPIEVTENA Mask */
+
+#define DWT_CTRL_EXCTRCENA_Pos             16U                                         /*!< DWT CTRL: EXCTRCENA Position */
+#define DWT_CTRL_EXCTRCENA_Msk             (0x1UL << DWT_CTRL_EXCTRCENA_Pos)           /*!< DWT CTRL: EXCTRCENA Mask */
+
+#define DWT_CTRL_PCSAMPLENA_Pos            12U                                         /*!< DWT CTRL: PCSAMPLENA Position */
+#define DWT_CTRL_PCSAMPLENA_Msk            (0x1UL << DWT_CTRL_PCSAMPLENA_Pos)          /*!< DWT CTRL: PCSAMPLENA Mask */
+
+#define DWT_CTRL_SYNCTAP_Pos               10U                                         /*!< DWT CTRL: SYNCTAP Position */
+#define DWT_CTRL_SYNCTAP_Msk               (0x3UL << DWT_CTRL_SYNCTAP_Pos)             /*!< DWT CTRL: SYNCTAP Mask */
+
+#define DWT_CTRL_CYCTAP_Pos                 9U                                         /*!< DWT CTRL: CYCTAP Position */
+#define DWT_CTRL_CYCTAP_Msk                (0x1UL << DWT_CTRL_CYCTAP_Pos)              /*!< DWT CTRL: CYCTAP Mask */
+
+#define DWT_CTRL_POSTINIT_Pos               5U                                         /*!< DWT CTRL: POSTINIT Position */
+#define DWT_CTRL_POSTINIT_Msk              (0xFUL << DWT_CTRL_POSTINIT_Pos)            /*!< DWT CTRL: POSTINIT Mask */
+
+#define DWT_CTRL_POSTPRESET_Pos             1U                                         /*!< DWT CTRL: POSTPRESET Position */
+#define DWT_CTRL_POSTPRESET_Msk            (0xFUL << DWT_CTRL_POSTPRESET_Pos)          /*!< DWT CTRL: POSTPRESET Mask */
+
+#define DWT_CTRL_CYCCNTENA_Pos              0U                                         /*!< DWT CTRL: CYCCNTENA Position */
+#define DWT_CTRL_CYCCNTENA_Msk             (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/)       /*!< DWT CTRL: CYCCNTENA Mask */
+
+/* DWT CPI Count Register Definitions */
+#define DWT_CPICNT_CPICNT_Pos               0U                                         /*!< DWT CPICNT: CPICNT Position */
+#define DWT_CPICNT_CPICNT_Msk              (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/)       /*!< DWT CPICNT: CPICNT Mask */
+
+/* DWT Exception Overhead Count Register Definitions */
+#define DWT_EXCCNT_EXCCNT_Pos               0U                                         /*!< DWT EXCCNT: EXCCNT Position */
+#define DWT_EXCCNT_EXCCNT_Msk              (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/)       /*!< DWT EXCCNT: EXCCNT Mask */
+
+/* DWT Sleep Count Register Definitions */
+#define DWT_SLEEPCNT_SLEEPCNT_Pos           0U                                         /*!< DWT SLEEPCNT: SLEEPCNT Position */
+#define DWT_SLEEPCNT_SLEEPCNT_Msk          (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/)   /*!< DWT SLEEPCNT: SLEEPCNT Mask */
+
+/* DWT LSU Count Register Definitions */
+#define DWT_LSUCNT_LSUCNT_Pos               0U                                         /*!< DWT LSUCNT: LSUCNT Position */
+#define DWT_LSUCNT_LSUCNT_Msk              (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/)       /*!< DWT LSUCNT: LSUCNT Mask */
+
+/* DWT Folded-instruction Count Register Definitions */
+#define DWT_FOLDCNT_FOLDCNT_Pos             0U                                         /*!< DWT FOLDCNT: FOLDCNT Position */
+#define DWT_FOLDCNT_FOLDCNT_Msk            (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/)     /*!< DWT FOLDCNT: FOLDCNT Mask */
+
+/* DWT Comparator Mask Register Definitions */
+#define DWT_MASK_MASK_Pos                   0U                                         /*!< DWT MASK: MASK Position */
+#define DWT_MASK_MASK_Msk                  (0x1FUL /*<< DWT_MASK_MASK_Pos*/)           /*!< DWT MASK: MASK Mask */
+
+/* DWT Comparator Function Register Definitions */
+#define DWT_FUNCTION_MATCHED_Pos           24U                                         /*!< DWT FUNCTION: MATCHED Position */
+#define DWT_FUNCTION_MATCHED_Msk           (0x1UL << DWT_FUNCTION_MATCHED_Pos)         /*!< DWT FUNCTION: MATCHED Mask */
+
+#define DWT_FUNCTION_DATAVADDR1_Pos        16U                                         /*!< DWT FUNCTION: DATAVADDR1 Position */
+#define DWT_FUNCTION_DATAVADDR1_Msk        (0xFUL << DWT_FUNCTION_DATAVADDR1_Pos)      /*!< DWT FUNCTION: DATAVADDR1 Mask */
+
+#define DWT_FUNCTION_DATAVADDR0_Pos        12U                                         /*!< DWT FUNCTION: DATAVADDR0 Position */
+#define DWT_FUNCTION_DATAVADDR0_Msk        (0xFUL << DWT_FUNCTION_DATAVADDR0_Pos)      /*!< DWT FUNCTION: DATAVADDR0 Mask */
+
+#define DWT_FUNCTION_DATAVSIZE_Pos         10U                                         /*!< DWT FUNCTION: DATAVSIZE Position */
+#define DWT_FUNCTION_DATAVSIZE_Msk         (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos)       /*!< DWT FUNCTION: DATAVSIZE Mask */
+
+#define DWT_FUNCTION_LNK1ENA_Pos            9U                                         /*!< DWT FUNCTION: LNK1ENA Position */
+#define DWT_FUNCTION_LNK1ENA_Msk           (0x1UL << DWT_FUNCTION_LNK1ENA_Pos)         /*!< DWT FUNCTION: LNK1ENA Mask */
+
+#define DWT_FUNCTION_DATAVMATCH_Pos         8U                                         /*!< DWT FUNCTION: DATAVMATCH Position */
+#define DWT_FUNCTION_DATAVMATCH_Msk        (0x1UL << DWT_FUNCTION_DATAVMATCH_Pos)      /*!< DWT FUNCTION: DATAVMATCH Mask */
+
+#define DWT_FUNCTION_CYCMATCH_Pos           7U                                         /*!< DWT FUNCTION: CYCMATCH Position */
+#define DWT_FUNCTION_CYCMATCH_Msk          (0x1UL << DWT_FUNCTION_CYCMATCH_Pos)        /*!< DWT FUNCTION: CYCMATCH Mask */
+
+#define DWT_FUNCTION_EMITRANGE_Pos          5U                                         /*!< DWT FUNCTION: EMITRANGE Position */
+#define DWT_FUNCTION_EMITRANGE_Msk         (0x1UL << DWT_FUNCTION_EMITRANGE_Pos)       /*!< DWT FUNCTION: EMITRANGE Mask */
+
+#define DWT_FUNCTION_FUNCTION_Pos           0U                                         /*!< DWT FUNCTION: FUNCTION Position */
+#define DWT_FUNCTION_FUNCTION_Msk          (0xFUL /*<< DWT_FUNCTION_FUNCTION_Pos*/)    /*!< DWT FUNCTION: FUNCTION Mask */
+
+/*@}*/ /* end of group CMSIS_DWT */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_TPI     Trace Port Interface (TPI)
+  \brief    Type definitions for the Trace Port Interface (TPI)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Trace Port Interface Register (TPI).
+ */
+typedef struct
+{
+  __IM  uint32_t SSPSR;                  /*!< Offset: 0x000 (R/ )  Supported Parallel Port Size Register */
+  __IOM uint32_t CSPSR;                  /*!< Offset: 0x004 (R/W)  Current Parallel Port Size Register */
+        uint32_t RESERVED0[2U];
+  __IOM uint32_t ACPR;                   /*!< Offset: 0x010 (R/W)  Asynchronous Clock Prescaler Register */
+        uint32_t RESERVED1[55U];
+  __IOM uint32_t SPPR;                   /*!< Offset: 0x0F0 (R/W)  Selected Pin Protocol Register */
+        uint32_t RESERVED2[131U];
+  __IM  uint32_t FFSR;                   /*!< Offset: 0x300 (R/ )  Formatter and Flush Status Register */
+  __IOM uint32_t FFCR;                   /*!< Offset: 0x304 (R/W)  Formatter and Flush Control Register */
+  __IM  uint32_t FSCR;                   /*!< Offset: 0x308 (R/ )  Formatter Synchronization Counter Register */
+        uint32_t RESERVED3[759U];
+  __IM  uint32_t TRIGGER;                /*!< Offset: 0xEE8 (R/ )  TRIGGER Register */
+  __IM  uint32_t FIFO0;                  /*!< Offset: 0xEEC (R/ )  Integration ETM Data */
+  __IM  uint32_t ITATBCTR2;              /*!< Offset: 0xEF0 (R/ )  ITATBCTR2 */
+        uint32_t RESERVED4[1U];
+  __IM  uint32_t ITATBCTR0;              /*!< Offset: 0xEF8 (R/ )  ITATBCTR0 */
+  __IM  uint32_t FIFO1;                  /*!< Offset: 0xEFC (R/ )  Integration ITM Data */
+  __IOM uint32_t ITCTRL;                 /*!< Offset: 0xF00 (R/W)  Integration Mode Control */
+        uint32_t RESERVED5[39U];
+  __IOM uint32_t CLAIMSET;               /*!< Offset: 0xFA0 (R/W)  Claim tag set */
+  __IOM uint32_t CLAIMCLR;               /*!< Offset: 0xFA4 (R/W)  Claim tag clear */
+        uint32_t RESERVED7[8U];
+  __IM  uint32_t DEVID;                  /*!< Offset: 0xFC8 (R/ )  TPIU_DEVID */
+  __IM  uint32_t DEVTYPE;                /*!< Offset: 0xFCC (R/ )  TPIU_DEVTYPE */
+} TPI_Type;
+
+/* TPI Asynchronous Clock Prescaler Register Definitions */
+#define TPI_ACPR_PRESCALER_Pos              0U                                         /*!< TPI ACPR: PRESCALER Position */
+#define TPI_ACPR_PRESCALER_Msk             (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/)    /*!< TPI ACPR: PRESCALER Mask */
+
+/* TPI Selected Pin Protocol Register Definitions */
+#define TPI_SPPR_TXMODE_Pos                 0U                                         /*!< TPI SPPR: TXMODE Position */
+#define TPI_SPPR_TXMODE_Msk                (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/)          /*!< TPI SPPR: TXMODE Mask */
+
+/* TPI Formatter and Flush Status Register Definitions */
+#define TPI_FFSR_FtNonStop_Pos              3U                                         /*!< TPI FFSR: FtNonStop Position */
+#define TPI_FFSR_FtNonStop_Msk             (0x1UL << TPI_FFSR_FtNonStop_Pos)           /*!< TPI FFSR: FtNonStop Mask */
+
+#define TPI_FFSR_TCPresent_Pos              2U                                         /*!< TPI FFSR: TCPresent Position */
+#define TPI_FFSR_TCPresent_Msk             (0x1UL << TPI_FFSR_TCPresent_Pos)           /*!< TPI FFSR: TCPresent Mask */
+
+#define TPI_FFSR_FtStopped_Pos              1U                                         /*!< TPI FFSR: FtStopped Position */
+#define TPI_FFSR_FtStopped_Msk             (0x1UL << TPI_FFSR_FtStopped_Pos)           /*!< TPI FFSR: FtStopped Mask */
+
+#define TPI_FFSR_FlInProg_Pos               0U                                         /*!< TPI FFSR: FlInProg Position */
+#define TPI_FFSR_FlInProg_Msk              (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/)        /*!< TPI FFSR: FlInProg Mask */
+
+/* TPI Formatter and Flush Control Register Definitions */
+#define TPI_FFCR_TrigIn_Pos                 8U                                         /*!< TPI FFCR: TrigIn Position */
+#define TPI_FFCR_TrigIn_Msk                (0x1UL << TPI_FFCR_TrigIn_Pos)              /*!< TPI FFCR: TrigIn Mask */
+
+#define TPI_FFCR_EnFCont_Pos                1U                                         /*!< TPI FFCR: EnFCont Position */
+#define TPI_FFCR_EnFCont_Msk               (0x1UL << TPI_FFCR_EnFCont_Pos)             /*!< TPI FFCR: EnFCont Mask */
+
+/* TPI TRIGGER Register Definitions */
+#define TPI_TRIGGER_TRIGGER_Pos             0U                                         /*!< TPI TRIGGER: TRIGGER Position */
+#define TPI_TRIGGER_TRIGGER_Msk            (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/)      /*!< TPI TRIGGER: TRIGGER Mask */
+
+/* TPI Integration ETM Data Register Definitions (FIFO0) */
+#define TPI_FIFO0_ITM_ATVALID_Pos          29U                                         /*!< TPI FIFO0: ITM_ATVALID Position */
+#define TPI_FIFO0_ITM_ATVALID_Msk          (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos)        /*!< TPI FIFO0: ITM_ATVALID Mask */
+
+#define TPI_FIFO0_ITM_bytecount_Pos        27U                                         /*!< TPI FIFO0: ITM_bytecount Position */
+#define TPI_FIFO0_ITM_bytecount_Msk        (0x3UL << TPI_FIFO0_ITM_bytecount_Pos)      /*!< TPI FIFO0: ITM_bytecount Mask */
+
+#define TPI_FIFO0_ETM_ATVALID_Pos          26U                                         /*!< TPI FIFO0: ETM_ATVALID Position */
+#define TPI_FIFO0_ETM_ATVALID_Msk          (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos)        /*!< TPI FIFO0: ETM_ATVALID Mask */
+
+#define TPI_FIFO0_ETM_bytecount_Pos        24U                                         /*!< TPI FIFO0: ETM_bytecount Position */
+#define TPI_FIFO0_ETM_bytecount_Msk        (0x3UL << TPI_FIFO0_ETM_bytecount_Pos)      /*!< TPI FIFO0: ETM_bytecount Mask */
+
+#define TPI_FIFO0_ETM2_Pos                 16U                                         /*!< TPI FIFO0: ETM2 Position */
+#define TPI_FIFO0_ETM2_Msk                 (0xFFUL << TPI_FIFO0_ETM2_Pos)              /*!< TPI FIFO0: ETM2 Mask */
+
+#define TPI_FIFO0_ETM1_Pos                  8U                                         /*!< TPI FIFO0: ETM1 Position */
+#define TPI_FIFO0_ETM1_Msk                 (0xFFUL << TPI_FIFO0_ETM1_Pos)              /*!< TPI FIFO0: ETM1 Mask */
+
+#define TPI_FIFO0_ETM0_Pos                  0U                                         /*!< TPI FIFO0: ETM0 Position */
+#define TPI_FIFO0_ETM0_Msk                 (0xFFUL /*<< TPI_FIFO0_ETM0_Pos*/)          /*!< TPI FIFO0: ETM0 Mask */
+
+/* TPI ITATBCTR2 Register Definitions */
+#define TPI_ITATBCTR2_ATREADY2_Pos          0U                                         /*!< TPI ITATBCTR2: ATREADY2 Position */
+#define TPI_ITATBCTR2_ATREADY2_Msk         (0x1UL /*<< TPI_ITATBCTR2_ATREADY2_Pos*/)   /*!< TPI ITATBCTR2: ATREADY2 Mask */
+
+#define TPI_ITATBCTR2_ATREADY1_Pos          0U                                         /*!< TPI ITATBCTR2: ATREADY1 Position */
+#define TPI_ITATBCTR2_ATREADY1_Msk         (0x1UL /*<< TPI_ITATBCTR2_ATREADY1_Pos*/)   /*!< TPI ITATBCTR2: ATREADY1 Mask */
+
+/* TPI Integration ITM Data Register Definitions (FIFO1) */
+#define TPI_FIFO1_ITM_ATVALID_Pos          29U                                         /*!< TPI FIFO1: ITM_ATVALID Position */
+#define TPI_FIFO1_ITM_ATVALID_Msk          (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos)        /*!< TPI FIFO1: ITM_ATVALID Mask */
+
+#define TPI_FIFO1_ITM_bytecount_Pos        27U                                         /*!< TPI FIFO1: ITM_bytecount Position */
+#define TPI_FIFO1_ITM_bytecount_Msk        (0x3UL << TPI_FIFO1_ITM_bytecount_Pos)      /*!< TPI FIFO1: ITM_bytecount Mask */
+
+#define TPI_FIFO1_ETM_ATVALID_Pos          26U                                         /*!< TPI FIFO1: ETM_ATVALID Position */
+#define TPI_FIFO1_ETM_ATVALID_Msk          (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos)        /*!< TPI FIFO1: ETM_ATVALID Mask */
+
+#define TPI_FIFO1_ETM_bytecount_Pos        24U                                         /*!< TPI FIFO1: ETM_bytecount Position */
+#define TPI_FIFO1_ETM_bytecount_Msk        (0x3UL << TPI_FIFO1_ETM_bytecount_Pos)      /*!< TPI FIFO1: ETM_bytecount Mask */
+
+#define TPI_FIFO1_ITM2_Pos                 16U                                         /*!< TPI FIFO1: ITM2 Position */
+#define TPI_FIFO1_ITM2_Msk                 (0xFFUL << TPI_FIFO1_ITM2_Pos)              /*!< TPI FIFO1: ITM2 Mask */
+
+#define TPI_FIFO1_ITM1_Pos                  8U                                         /*!< TPI FIFO1: ITM1 Position */
+#define TPI_FIFO1_ITM1_Msk                 (0xFFUL << TPI_FIFO1_ITM1_Pos)              /*!< TPI FIFO1: ITM1 Mask */
+
+#define TPI_FIFO1_ITM0_Pos                  0U                                         /*!< TPI FIFO1: ITM0 Position */
+#define TPI_FIFO1_ITM0_Msk                 (0xFFUL /*<< TPI_FIFO1_ITM0_Pos*/)          /*!< TPI FIFO1: ITM0 Mask */
+
+/* TPI ITATBCTR0 Register Definitions */
+#define TPI_ITATBCTR0_ATREADY2_Pos          0U                                         /*!< TPI ITATBCTR0: ATREADY2 Position */
+#define TPI_ITATBCTR0_ATREADY2_Msk         (0x1UL /*<< TPI_ITATBCTR0_ATREADY2_Pos*/)   /*!< TPI ITATBCTR0: ATREADY2 Mask */
+
+#define TPI_ITATBCTR0_ATREADY1_Pos          0U                                         /*!< TPI ITATBCTR0: ATREADY1 Position */
+#define TPI_ITATBCTR0_ATREADY1_Msk         (0x1UL /*<< TPI_ITATBCTR0_ATREADY1_Pos*/)   /*!< TPI ITATBCTR0: ATREADY1 Mask */
+
+/* TPI Integration Mode Control Register Definitions */
+#define TPI_ITCTRL_Mode_Pos                 0U                                         /*!< TPI ITCTRL: Mode Position */
+#define TPI_ITCTRL_Mode_Msk                (0x3UL /*<< TPI_ITCTRL_Mode_Pos*/)          /*!< TPI ITCTRL: Mode Mask */
+
+/* TPI DEVID Register Definitions */
+#define TPI_DEVID_NRZVALID_Pos             11U                                         /*!< TPI DEVID: NRZVALID Position */
+#define TPI_DEVID_NRZVALID_Msk             (0x1UL << TPI_DEVID_NRZVALID_Pos)           /*!< TPI DEVID: NRZVALID Mask */
+
+#define TPI_DEVID_MANCVALID_Pos            10U                                         /*!< TPI DEVID: MANCVALID Position */
+#define TPI_DEVID_MANCVALID_Msk            (0x1UL << TPI_DEVID_MANCVALID_Pos)          /*!< TPI DEVID: MANCVALID Mask */
+
+#define TPI_DEVID_PTINVALID_Pos             9U                                         /*!< TPI DEVID: PTINVALID Position */
+#define TPI_DEVID_PTINVALID_Msk            (0x1UL << TPI_DEVID_PTINVALID_Pos)          /*!< TPI DEVID: PTINVALID Mask */
+
+#define TPI_DEVID_MinBufSz_Pos              6U                                         /*!< TPI DEVID: MinBufSz Position */
+#define TPI_DEVID_MinBufSz_Msk             (0x7UL << TPI_DEVID_MinBufSz_Pos)           /*!< TPI DEVID: MinBufSz Mask */
+
+#define TPI_DEVID_AsynClkIn_Pos             5U                                         /*!< TPI DEVID: AsynClkIn Position */
+#define TPI_DEVID_AsynClkIn_Msk            (0x1UL << TPI_DEVID_AsynClkIn_Pos)          /*!< TPI DEVID: AsynClkIn Mask */
+
+#define TPI_DEVID_NrTraceInput_Pos          0U                                         /*!< TPI DEVID: NrTraceInput Position */
+#define TPI_DEVID_NrTraceInput_Msk         (0x1FUL /*<< TPI_DEVID_NrTraceInput_Pos*/)  /*!< TPI DEVID: NrTraceInput Mask */
+
+/* TPI DEVTYPE Register Definitions */
+#define TPI_DEVTYPE_SubType_Pos             4U                                         /*!< TPI DEVTYPE: SubType Position */
+#define TPI_DEVTYPE_SubType_Msk            (0xFUL /*<< TPI_DEVTYPE_SubType_Pos*/)      /*!< TPI DEVTYPE: SubType Mask */
+
+#define TPI_DEVTYPE_MajorType_Pos           0U                                         /*!< TPI DEVTYPE: MajorType Position */
+#define TPI_DEVTYPE_MajorType_Msk          (0xFUL << TPI_DEVTYPE_MajorType_Pos)        /*!< TPI DEVTYPE: MajorType Mask */
+
+/*@}*/ /* end of group CMSIS_TPI */
+
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_MPU     Memory Protection Unit (MPU)
+  \brief    Type definitions for the Memory Protection Unit (MPU)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Memory Protection Unit (MPU).
+ */
+typedef struct
+{
+  __IM  uint32_t TYPE;                   /*!< Offset: 0x000 (R/ )  MPU Type Register */
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x004 (R/W)  MPU Control Register */
+  __IOM uint32_t RNR;                    /*!< Offset: 0x008 (R/W)  MPU Region RNRber Register */
+  __IOM uint32_t RBAR;                   /*!< Offset: 0x00C (R/W)  MPU Region Base Address Register */
+  __IOM uint32_t RASR;                   /*!< Offset: 0x010 (R/W)  MPU Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A1;                /*!< Offset: 0x014 (R/W)  MPU Alias 1 Region Base Address Register */
+  __IOM uint32_t RASR_A1;                /*!< Offset: 0x018 (R/W)  MPU Alias 1 Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A2;                /*!< Offset: 0x01C (R/W)  MPU Alias 2 Region Base Address Register */
+  __IOM uint32_t RASR_A2;                /*!< Offset: 0x020 (R/W)  MPU Alias 2 Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A3;                /*!< Offset: 0x024 (R/W)  MPU Alias 3 Region Base Address Register */
+  __IOM uint32_t RASR_A3;                /*!< Offset: 0x028 (R/W)  MPU Alias 3 Region Attribute and Size Register */
+} MPU_Type;
+
+#define MPU_TYPE_RALIASES                  4U
+
+/* MPU Type Register Definitions */
+#define MPU_TYPE_IREGION_Pos               16U                                            /*!< MPU TYPE: IREGION Position */
+#define MPU_TYPE_IREGION_Msk               (0xFFUL << MPU_TYPE_IREGION_Pos)               /*!< MPU TYPE: IREGION Mask */
+
+#define MPU_TYPE_DREGION_Pos                8U                                            /*!< MPU TYPE: DREGION Position */
+#define MPU_TYPE_DREGION_Msk               (0xFFUL << MPU_TYPE_DREGION_Pos)               /*!< MPU TYPE: DREGION Mask */
+
+#define MPU_TYPE_SEPARATE_Pos               0U                                            /*!< MPU TYPE: SEPARATE Position */
+#define MPU_TYPE_SEPARATE_Msk              (1UL /*<< MPU_TYPE_SEPARATE_Pos*/)             /*!< MPU TYPE: SEPARATE Mask */
+
+/* MPU Control Register Definitions */
+#define MPU_CTRL_PRIVDEFENA_Pos             2U                                            /*!< MPU CTRL: PRIVDEFENA Position */
+#define MPU_CTRL_PRIVDEFENA_Msk            (1UL << MPU_CTRL_PRIVDEFENA_Pos)               /*!< MPU CTRL: PRIVDEFENA Mask */
+
+#define MPU_CTRL_HFNMIENA_Pos               1U                                            /*!< MPU CTRL: HFNMIENA Position */
+#define MPU_CTRL_HFNMIENA_Msk              (1UL << MPU_CTRL_HFNMIENA_Pos)                 /*!< MPU CTRL: HFNMIENA Mask */
+
+#define MPU_CTRL_ENABLE_Pos                 0U                                            /*!< MPU CTRL: ENABLE Position */
+#define MPU_CTRL_ENABLE_Msk                (1UL /*<< MPU_CTRL_ENABLE_Pos*/)               /*!< MPU CTRL: ENABLE Mask */
+
+/* MPU Region Number Register Definitions */
+#define MPU_RNR_REGION_Pos                  0U                                            /*!< MPU RNR: REGION Position */
+#define MPU_RNR_REGION_Msk                 (0xFFUL /*<< MPU_RNR_REGION_Pos*/)             /*!< MPU RNR: REGION Mask */
+
+/* MPU Region Base Address Register Definitions */
+#define MPU_RBAR_ADDR_Pos                   5U                                            /*!< MPU RBAR: ADDR Position */
+#define MPU_RBAR_ADDR_Msk                  (0x7FFFFFFUL << MPU_RBAR_ADDR_Pos)             /*!< MPU RBAR: ADDR Mask */
+
+#define MPU_RBAR_VALID_Pos                  4U                                            /*!< MPU RBAR: VALID Position */
+#define MPU_RBAR_VALID_Msk                 (1UL << MPU_RBAR_VALID_Pos)                    /*!< MPU RBAR: VALID Mask */
+
+#define MPU_RBAR_REGION_Pos                 0U                                            /*!< MPU RBAR: REGION Position */
+#define MPU_RBAR_REGION_Msk                (0xFUL /*<< MPU_RBAR_REGION_Pos*/)             /*!< MPU RBAR: REGION Mask */
+
+/* MPU Region Attribute and Size Register Definitions */
+#define MPU_RASR_ATTRS_Pos                 16U                                            /*!< MPU RASR: MPU Region Attribute field Position */
+#define MPU_RASR_ATTRS_Msk                 (0xFFFFUL << MPU_RASR_ATTRS_Pos)               /*!< MPU RASR: MPU Region Attribute field Mask */
+
+#define MPU_RASR_XN_Pos                    28U                                            /*!< MPU RASR: ATTRS.XN Position */
+#define MPU_RASR_XN_Msk                    (1UL << MPU_RASR_XN_Pos)                       /*!< MPU RASR: ATTRS.XN Mask */
+
+#define MPU_RASR_AP_Pos                    24U                                            /*!< MPU RASR: ATTRS.AP Position */
+#define MPU_RASR_AP_Msk                    (0x7UL << MPU_RASR_AP_Pos)                     /*!< MPU RASR: ATTRS.AP Mask */
+
+#define MPU_RASR_TEX_Pos                   19U                                            /*!< MPU RASR: ATTRS.TEX Position */
+#define MPU_RASR_TEX_Msk                   (0x7UL << MPU_RASR_TEX_Pos)                    /*!< MPU RASR: ATTRS.TEX Mask */
+
+#define MPU_RASR_S_Pos                     18U                                            /*!< MPU RASR: ATTRS.S Position */
+#define MPU_RASR_S_Msk                     (1UL << MPU_RASR_S_Pos)                        /*!< MPU RASR: ATTRS.S Mask */
+
+#define MPU_RASR_C_Pos                     17U                                            /*!< MPU RASR: ATTRS.C Position */
+#define MPU_RASR_C_Msk                     (1UL << MPU_RASR_C_Pos)                        /*!< MPU RASR: ATTRS.C Mask */
+
+#define MPU_RASR_B_Pos                     16U                                            /*!< MPU RASR: ATTRS.B Position */
+#define MPU_RASR_B_Msk                     (1UL << MPU_RASR_B_Pos)                        /*!< MPU RASR: ATTRS.B Mask */
+
+#define MPU_RASR_SRD_Pos                    8U                                            /*!< MPU RASR: Sub-Region Disable Position */
+#define MPU_RASR_SRD_Msk                   (0xFFUL << MPU_RASR_SRD_Pos)                   /*!< MPU RASR: Sub-Region Disable Mask */
+
+#define MPU_RASR_SIZE_Pos                   1U                                            /*!< MPU RASR: Region Size Field Position */
+#define MPU_RASR_SIZE_Msk                  (0x1FUL << MPU_RASR_SIZE_Pos)                  /*!< MPU RASR: Region Size Field Mask */
+
+#define MPU_RASR_ENABLE_Pos                 0U                                            /*!< MPU RASR: Region enable bit Position */
+#define MPU_RASR_ENABLE_Msk                (1UL /*<< MPU_RASR_ENABLE_Pos*/)               /*!< MPU RASR: Region enable bit Disable Mask */
+
+/*@} end of group CMSIS_MPU */
+#endif /* defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_FPU     Floating Point Unit (FPU)
+  \brief    Type definitions for the Floating Point Unit (FPU)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Floating Point Unit (FPU).
+ */
+typedef struct
+{
+        uint32_t RESERVED0[1U];
+  __IOM uint32_t FPCCR;                  /*!< Offset: 0x004 (R/W)  Floating-Point Context Control Register */
+  __IOM uint32_t FPCAR;                  /*!< Offset: 0x008 (R/W)  Floating-Point Context Address Register */
+  __IOM uint32_t FPDSCR;                 /*!< Offset: 0x00C (R/W)  Floating-Point Default Status Control Register */
+  __IM  uint32_t MVFR0;                  /*!< Offset: 0x010 (R/ )  Media and FP Feature Register 0 */
+  __IM  uint32_t MVFR1;                  /*!< Offset: 0x014 (R/ )  Media and FP Feature Register 1 */
+  __IM  uint32_t MVFR2;                  /*!< Offset: 0x018 (R/ )  Media and FP Feature Register 2 */
+} FPU_Type;
+
+/* Floating-Point Context Control Register Definitions */
+#define FPU_FPCCR_ASPEN_Pos                31U                                            /*!< FPCCR: ASPEN bit Position */
+#define FPU_FPCCR_ASPEN_Msk                (1UL << FPU_FPCCR_ASPEN_Pos)                   /*!< FPCCR: ASPEN bit Mask */
+
+#define FPU_FPCCR_LSPEN_Pos                30U                                            /*!< FPCCR: LSPEN Position */
+#define FPU_FPCCR_LSPEN_Msk                (1UL << FPU_FPCCR_LSPEN_Pos)                   /*!< FPCCR: LSPEN bit Mask */
+
+#define FPU_FPCCR_MONRDY_Pos                8U                                            /*!< FPCCR: MONRDY Position */
+#define FPU_FPCCR_MONRDY_Msk               (1UL << FPU_FPCCR_MONRDY_Pos)                  /*!< FPCCR: MONRDY bit Mask */
+
+#define FPU_FPCCR_BFRDY_Pos                 6U                                            /*!< FPCCR: BFRDY Position */
+#define FPU_FPCCR_BFRDY_Msk                (1UL << FPU_FPCCR_BFRDY_Pos)                   /*!< FPCCR: BFRDY bit Mask */
+
+#define FPU_FPCCR_MMRDY_Pos                 5U                                            /*!< FPCCR: MMRDY Position */
+#define FPU_FPCCR_MMRDY_Msk                (1UL << FPU_FPCCR_MMRDY_Pos)                   /*!< FPCCR: MMRDY bit Mask */
+
+#define FPU_FPCCR_HFRDY_Pos                 4U                                            /*!< FPCCR: HFRDY Position */
+#define FPU_FPCCR_HFRDY_Msk                (1UL << FPU_FPCCR_HFRDY_Pos)                   /*!< FPCCR: HFRDY bit Mask */
+
+#define FPU_FPCCR_THREAD_Pos                3U                                            /*!< FPCCR: processor mode bit Position */
+#define FPU_FPCCR_THREAD_Msk               (1UL << FPU_FPCCR_THREAD_Pos)                  /*!< FPCCR: processor mode active bit Mask */
+
+#define FPU_FPCCR_USER_Pos                  1U                                            /*!< FPCCR: privilege level bit Position */
+#define FPU_FPCCR_USER_Msk                 (1UL << FPU_FPCCR_USER_Pos)                    /*!< FPCCR: privilege level bit Mask */
+
+#define FPU_FPCCR_LSPACT_Pos                0U                                            /*!< FPCCR: Lazy state preservation active bit Position */
+#define FPU_FPCCR_LSPACT_Msk               (1UL /*<< FPU_FPCCR_LSPACT_Pos*/)              /*!< FPCCR: Lazy state preservation active bit Mask */
+
+/* Floating-Point Context Address Register Definitions */
+#define FPU_FPCAR_ADDRESS_Pos               3U                                            /*!< FPCAR: ADDRESS bit Position */
+#define FPU_FPCAR_ADDRESS_Msk              (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos)        /*!< FPCAR: ADDRESS bit Mask */
+
+/* Floating-Point Default Status Control Register Definitions */
+#define FPU_FPDSCR_AHP_Pos                 26U                                            /*!< FPDSCR: AHP bit Position */
+#define FPU_FPDSCR_AHP_Msk                 (1UL << FPU_FPDSCR_AHP_Pos)                    /*!< FPDSCR: AHP bit Mask */
+
+#define FPU_FPDSCR_DN_Pos                  25U                                            /*!< FPDSCR: DN bit Position */
+#define FPU_FPDSCR_DN_Msk                  (1UL << FPU_FPDSCR_DN_Pos)                     /*!< FPDSCR: DN bit Mask */
+
+#define FPU_FPDSCR_FZ_Pos                  24U                                            /*!< FPDSCR: FZ bit Position */
+#define FPU_FPDSCR_FZ_Msk                  (1UL << FPU_FPDSCR_FZ_Pos)                     /*!< FPDSCR: FZ bit Mask */
+
+#define FPU_FPDSCR_RMode_Pos               22U                                            /*!< FPDSCR: RMode bit Position */
+#define FPU_FPDSCR_RMode_Msk               (3UL << FPU_FPDSCR_RMode_Pos)                  /*!< FPDSCR: RMode bit Mask */
+
+/* Media and FP Feature Register 0 Definitions */
+#define FPU_MVFR0_FP_rounding_modes_Pos    28U                                            /*!< MVFR0: FP rounding modes bits Position */
+#define FPU_MVFR0_FP_rounding_modes_Msk    (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos)     /*!< MVFR0: FP rounding modes bits Mask */
+
+#define FPU_MVFR0_Short_vectors_Pos        24U                                            /*!< MVFR0: Short vectors bits Position */
+#define FPU_MVFR0_Short_vectors_Msk        (0xFUL << FPU_MVFR0_Short_vectors_Pos)         /*!< MVFR0: Short vectors bits Mask */
+
+#define FPU_MVFR0_Square_root_Pos          20U                                            /*!< MVFR0: Square root bits Position */
+#define FPU_MVFR0_Square_root_Msk          (0xFUL << FPU_MVFR0_Square_root_Pos)           /*!< MVFR0: Square root bits Mask */
+
+#define FPU_MVFR0_Divide_Pos               16U                                            /*!< MVFR0: Divide bits Position */
+#define FPU_MVFR0_Divide_Msk               (0xFUL << FPU_MVFR0_Divide_Pos)                /*!< MVFR0: Divide bits Mask */
+
+#define FPU_MVFR0_FP_excep_trapping_Pos    12U                                            /*!< MVFR0: FP exception trapping bits Position */
+#define FPU_MVFR0_FP_excep_trapping_Msk    (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos)     /*!< MVFR0: FP exception trapping bits Mask */
+
+#define FPU_MVFR0_Double_precision_Pos      8U                                            /*!< MVFR0: Double-precision bits Position */
+#define FPU_MVFR0_Double_precision_Msk     (0xFUL << FPU_MVFR0_Double_precision_Pos)      /*!< MVFR0: Double-precision bits Mask */
+
+#define FPU_MVFR0_Single_precision_Pos      4U                                            /*!< MVFR0: Single-precision bits Position */
+#define FPU_MVFR0_Single_precision_Msk     (0xFUL << FPU_MVFR0_Single_precision_Pos)      /*!< MVFR0: Single-precision bits Mask */
+
+#define FPU_MVFR0_A_SIMD_registers_Pos      0U                                            /*!< MVFR0: A_SIMD registers bits Position */
+#define FPU_MVFR0_A_SIMD_registers_Msk     (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/)  /*!< MVFR0: A_SIMD registers bits Mask */
+
+/* Media and FP Feature Register 1 Definitions */
+#define FPU_MVFR1_FP_fused_MAC_Pos         28U                                            /*!< MVFR1: FP fused MAC bits Position */
+#define FPU_MVFR1_FP_fused_MAC_Msk         (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos)          /*!< MVFR1: FP fused MAC bits Mask */
+
+#define FPU_MVFR1_FP_HPFP_Pos              24U                                            /*!< MVFR1: FP HPFP bits Position */
+#define FPU_MVFR1_FP_HPFP_Msk              (0xFUL << FPU_MVFR1_FP_HPFP_Pos)               /*!< MVFR1: FP HPFP bits Mask */
+
+#define FPU_MVFR1_D_NaN_mode_Pos            4U                                            /*!< MVFR1: D_NaN mode bits Position */
+#define FPU_MVFR1_D_NaN_mode_Msk           (0xFUL << FPU_MVFR1_D_NaN_mode_Pos)            /*!< MVFR1: D_NaN mode bits Mask */
+
+#define FPU_MVFR1_FtZ_mode_Pos              0U                                            /*!< MVFR1: FtZ mode bits Position */
+#define FPU_MVFR1_FtZ_mode_Msk             (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/)          /*!< MVFR1: FtZ mode bits Mask */
+
+/* Media and FP Feature Register 2 Definitions */
+
+#define FPU_MVFR2_VFP_Misc_Pos              4U                                            /*!< MVFR2: VFP Misc bits Position */
+#define FPU_MVFR2_VFP_Misc_Msk             (0xFUL << FPU_MVFR2_VFP_Misc_Pos)              /*!< MVFR2: VFP Misc bits Mask */
+
+/*@} end of group CMSIS_FPU */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_CoreDebug       Core Debug Registers (CoreDebug)
+  \brief    Type definitions for the Core Debug Registers
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Core Debug Register (CoreDebug).
+ */
+typedef struct
+{
+  __IOM uint32_t DHCSR;                  /*!< Offset: 0x000 (R/W)  Debug Halting Control and Status Register */
+  __OM  uint32_t DCRSR;                  /*!< Offset: 0x004 ( /W)  Debug Core Register Selector Register */
+  __IOM uint32_t DCRDR;                  /*!< Offset: 0x008 (R/W)  Debug Core Register Data Register */
+  __IOM uint32_t DEMCR;                  /*!< Offset: 0x00C (R/W)  Debug Exception and Monitor Control Register */
+} CoreDebug_Type;
+
+/* Debug Halting Control and Status Register Definitions */
+#define CoreDebug_DHCSR_DBGKEY_Pos         16U                                            /*!< CoreDebug DHCSR: DBGKEY Position */
+#define CoreDebug_DHCSR_DBGKEY_Msk         (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos)       /*!< CoreDebug DHCSR: DBGKEY Mask */
+
+#define CoreDebug_DHCSR_S_RESET_ST_Pos     25U                                            /*!< CoreDebug DHCSR: S_RESET_ST Position */
+#define CoreDebug_DHCSR_S_RESET_ST_Msk     (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos)        /*!< CoreDebug DHCSR: S_RESET_ST Mask */
+
+#define CoreDebug_DHCSR_S_RETIRE_ST_Pos    24U                                            /*!< CoreDebug DHCSR: S_RETIRE_ST Position */
+#define CoreDebug_DHCSR_S_RETIRE_ST_Msk    (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos)       /*!< CoreDebug DHCSR: S_RETIRE_ST Mask */
+
+#define CoreDebug_DHCSR_S_LOCKUP_Pos       19U                                            /*!< CoreDebug DHCSR: S_LOCKUP Position */
+#define CoreDebug_DHCSR_S_LOCKUP_Msk       (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos)          /*!< CoreDebug DHCSR: S_LOCKUP Mask */
+
+#define CoreDebug_DHCSR_S_SLEEP_Pos        18U                                            /*!< CoreDebug DHCSR: S_SLEEP Position */
+#define CoreDebug_DHCSR_S_SLEEP_Msk        (1UL << CoreDebug_DHCSR_S_SLEEP_Pos)           /*!< CoreDebug DHCSR: S_SLEEP Mask */
+
+#define CoreDebug_DHCSR_S_HALT_Pos         17U                                            /*!< CoreDebug DHCSR: S_HALT Position */
+#define CoreDebug_DHCSR_S_HALT_Msk         (1UL << CoreDebug_DHCSR_S_HALT_Pos)            /*!< CoreDebug DHCSR: S_HALT Mask */
+
+#define CoreDebug_DHCSR_S_REGRDY_Pos       16U                                            /*!< CoreDebug DHCSR: S_REGRDY Position */
+#define CoreDebug_DHCSR_S_REGRDY_Msk       (1UL << CoreDebug_DHCSR_S_REGRDY_Pos)          /*!< CoreDebug DHCSR: S_REGRDY Mask */
+
+#define CoreDebug_DHCSR_C_SNAPSTALL_Pos     5U                                            /*!< CoreDebug DHCSR: C_SNAPSTALL Position */
+#define CoreDebug_DHCSR_C_SNAPSTALL_Msk    (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos)       /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */
+
+#define CoreDebug_DHCSR_C_MASKINTS_Pos      3U                                            /*!< CoreDebug DHCSR: C_MASKINTS Position */
+#define CoreDebug_DHCSR_C_MASKINTS_Msk     (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos)        /*!< CoreDebug DHCSR: C_MASKINTS Mask */
+
+#define CoreDebug_DHCSR_C_STEP_Pos          2U                                            /*!< CoreDebug DHCSR: C_STEP Position */
+#define CoreDebug_DHCSR_C_STEP_Msk         (1UL << CoreDebug_DHCSR_C_STEP_Pos)            /*!< CoreDebug DHCSR: C_STEP Mask */
+
+#define CoreDebug_DHCSR_C_HALT_Pos          1U                                            /*!< CoreDebug DHCSR: C_HALT Position */
+#define CoreDebug_DHCSR_C_HALT_Msk         (1UL << CoreDebug_DHCSR_C_HALT_Pos)            /*!< CoreDebug DHCSR: C_HALT Mask */
+
+#define CoreDebug_DHCSR_C_DEBUGEN_Pos       0U                                            /*!< CoreDebug DHCSR: C_DEBUGEN Position */
+#define CoreDebug_DHCSR_C_DEBUGEN_Msk      (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/)     /*!< CoreDebug DHCSR: C_DEBUGEN Mask */
+
+/* Debug Core Register Selector Register Definitions */
+#define CoreDebug_DCRSR_REGWnR_Pos         16U                                            /*!< CoreDebug DCRSR: REGWnR Position */
+#define CoreDebug_DCRSR_REGWnR_Msk         (1UL << CoreDebug_DCRSR_REGWnR_Pos)            /*!< CoreDebug DCRSR: REGWnR Mask */
+
+#define CoreDebug_DCRSR_REGSEL_Pos          0U                                            /*!< CoreDebug DCRSR: REGSEL Position */
+#define CoreDebug_DCRSR_REGSEL_Msk         (0x1FUL /*<< CoreDebug_DCRSR_REGSEL_Pos*/)     /*!< CoreDebug DCRSR: REGSEL Mask */
+
+/* Debug Exception and Monitor Control Register Definitions */
+#define CoreDebug_DEMCR_TRCENA_Pos         24U                                            /*!< CoreDebug DEMCR: TRCENA Position */
+#define CoreDebug_DEMCR_TRCENA_Msk         (1UL << CoreDebug_DEMCR_TRCENA_Pos)            /*!< CoreDebug DEMCR: TRCENA Mask */
+
+#define CoreDebug_DEMCR_MON_REQ_Pos        19U                                            /*!< CoreDebug DEMCR: MON_REQ Position */
+#define CoreDebug_DEMCR_MON_REQ_Msk        (1UL << CoreDebug_DEMCR_MON_REQ_Pos)           /*!< CoreDebug DEMCR: MON_REQ Mask */
+
+#define CoreDebug_DEMCR_MON_STEP_Pos       18U                                            /*!< CoreDebug DEMCR: MON_STEP Position */
+#define CoreDebug_DEMCR_MON_STEP_Msk       (1UL << CoreDebug_DEMCR_MON_STEP_Pos)          /*!< CoreDebug DEMCR: MON_STEP Mask */
+
+#define CoreDebug_DEMCR_MON_PEND_Pos       17U                                            /*!< CoreDebug DEMCR: MON_PEND Position */
+#define CoreDebug_DEMCR_MON_PEND_Msk       (1UL << CoreDebug_DEMCR_MON_PEND_Pos)          /*!< CoreDebug DEMCR: MON_PEND Mask */
+
+#define CoreDebug_DEMCR_MON_EN_Pos         16U                                            /*!< CoreDebug DEMCR: MON_EN Position */
+#define CoreDebug_DEMCR_MON_EN_Msk         (1UL << CoreDebug_DEMCR_MON_EN_Pos)            /*!< CoreDebug DEMCR: MON_EN Mask */
+
+#define CoreDebug_DEMCR_VC_HARDERR_Pos     10U                                            /*!< CoreDebug DEMCR: VC_HARDERR Position */
+#define CoreDebug_DEMCR_VC_HARDERR_Msk     (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos)        /*!< CoreDebug DEMCR: VC_HARDERR Mask */
+
+#define CoreDebug_DEMCR_VC_INTERR_Pos       9U                                            /*!< CoreDebug DEMCR: VC_INTERR Position */
+#define CoreDebug_DEMCR_VC_INTERR_Msk      (1UL << CoreDebug_DEMCR_VC_INTERR_Pos)         /*!< CoreDebug DEMCR: VC_INTERR Mask */
+
+#define CoreDebug_DEMCR_VC_BUSERR_Pos       8U                                            /*!< CoreDebug DEMCR: VC_BUSERR Position */
+#define CoreDebug_DEMCR_VC_BUSERR_Msk      (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos)         /*!< CoreDebug DEMCR: VC_BUSERR Mask */
+
+#define CoreDebug_DEMCR_VC_STATERR_Pos      7U                                            /*!< CoreDebug DEMCR: VC_STATERR Position */
+#define CoreDebug_DEMCR_VC_STATERR_Msk     (1UL << CoreDebug_DEMCR_VC_STATERR_Pos)        /*!< CoreDebug DEMCR: VC_STATERR Mask */
+
+#define CoreDebug_DEMCR_VC_CHKERR_Pos       6U                                            /*!< CoreDebug DEMCR: VC_CHKERR Position */
+#define CoreDebug_DEMCR_VC_CHKERR_Msk      (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos)         /*!< CoreDebug DEMCR: VC_CHKERR Mask */
+
+#define CoreDebug_DEMCR_VC_NOCPERR_Pos      5U                                            /*!< CoreDebug DEMCR: VC_NOCPERR Position */
+#define CoreDebug_DEMCR_VC_NOCPERR_Msk     (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos)        /*!< CoreDebug DEMCR: VC_NOCPERR Mask */
+
+#define CoreDebug_DEMCR_VC_MMERR_Pos        4U                                            /*!< CoreDebug DEMCR: VC_MMERR Position */
+#define CoreDebug_DEMCR_VC_MMERR_Msk       (1UL << CoreDebug_DEMCR_VC_MMERR_Pos)          /*!< CoreDebug DEMCR: VC_MMERR Mask */
+
+#define CoreDebug_DEMCR_VC_CORERESET_Pos    0U                                            /*!< CoreDebug DEMCR: VC_CORERESET Position */
+#define CoreDebug_DEMCR_VC_CORERESET_Msk   (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/)  /*!< CoreDebug DEMCR: VC_CORERESET Mask */
+
+/*@} end of group CMSIS_CoreDebug */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_core_bitfield     Core register bit field macros
+  \brief      Macros for use with bit field definitions (xxx_Pos, xxx_Msk).
+  @{
+ */
+
+/**
+  \brief   Mask and shift a bit field value for use in a register bit range.
+  \param[in] field  Name of the register bit field.
+  \param[in] value  Value of the bit field. This parameter is interpreted as an uint32_t type.
+  \return           Masked and shifted value.
+*/
+#define _VAL2FLD(field, value)    (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+  \brief     Mask and shift a register value to extract a bit filed value.
+  \param[in] field  Name of the register bit field.
+  \param[in] value  Value of register. This parameter is interpreted as an uint32_t type.
+  \return           Masked and shifted bit field value.
+*/
+#define _FLD2VAL(field, value)    (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/*@} end of group CMSIS_core_bitfield */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_core_base     Core Definitions
+  \brief      Definitions for base addresses, unions, and structures.
+  @{
+ */
+
+/* Memory mapping of Core Hardware */
+#define SCS_BASE            (0xE000E000UL)                            /*!< System Control Space Base Address */
+#define ITM_BASE            (0xE0000000UL)                            /*!< ITM Base Address */
+#define DWT_BASE            (0xE0001000UL)                            /*!< DWT Base Address */
+#define TPI_BASE            (0xE0040000UL)                            /*!< TPI Base Address */
+#define CoreDebug_BASE      (0xE000EDF0UL)                            /*!< Core Debug Base Address */
+#define SysTick_BASE        (SCS_BASE +  0x0010UL)                    /*!< SysTick Base Address */
+#define NVIC_BASE           (SCS_BASE +  0x0100UL)                    /*!< NVIC Base Address */
+#define SCB_BASE            (SCS_BASE +  0x0D00UL)                    /*!< System Control Block Base Address */
+
+#define SCnSCB              ((SCnSCB_Type    *)     SCS_BASE      )   /*!< System control Register not in SCB */
+#define SCB                 ((SCB_Type       *)     SCB_BASE      )   /*!< SCB configuration struct */
+#define SysTick             ((SysTick_Type   *)     SysTick_BASE  )   /*!< SysTick configuration struct */
+#define NVIC                ((NVIC_Type      *)     NVIC_BASE     )   /*!< NVIC configuration struct */
+#define ITM                 ((ITM_Type       *)     ITM_BASE      )   /*!< ITM configuration struct */
+#define DWT                 ((DWT_Type       *)     DWT_BASE      )   /*!< DWT configuration struct */
+#define TPI                 ((TPI_Type       *)     TPI_BASE      )   /*!< TPI configuration struct */
+#define CoreDebug           ((CoreDebug_Type *)     CoreDebug_BASE)   /*!< Core Debug configuration struct */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+  #define MPU_BASE          (SCS_BASE +  0x0D90UL)                    /*!< Memory Protection Unit */
+  #define MPU               ((MPU_Type       *)     MPU_BASE      )   /*!< Memory Protection Unit */
+#endif
+
+#define FPU_BASE            (SCS_BASE +  0x0F30UL)                    /*!< Floating Point Unit */
+#define FPU                 ((FPU_Type       *)     FPU_BASE      )   /*!< Floating Point Unit */
+
+/*@} */
+
+
+
+/*******************************************************************************
+ *                Hardware Abstraction Layer
+  Core Function Interface contains:
+  - Core NVIC Functions
+  - Core SysTick Functions
+  - Core Debug Functions
+  - Core Register Access Functions
+ ******************************************************************************/
+/**
+  \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference
+*/
+
+
+
+/* ##########################   NVIC functions  #################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_NVICFunctions NVIC Functions
+  \brief    Functions that manage interrupts and exceptions via the NVIC.
+  @{
+ */
+
+#ifdef CMSIS_NVIC_VIRTUAL
+  #ifndef CMSIS_NVIC_VIRTUAL_HEADER_FILE
+    #define CMSIS_NVIC_VIRTUAL_HEADER_FILE "cmsis_nvic_virtual.h"
+  #endif
+  #include CMSIS_NVIC_VIRTUAL_HEADER_FILE
+#else
+  #define NVIC_SetPriorityGrouping    __NVIC_SetPriorityGrouping
+  #define NVIC_GetPriorityGrouping    __NVIC_GetPriorityGrouping
+  #define NVIC_EnableIRQ              __NVIC_EnableIRQ
+  #define NVIC_GetEnableIRQ           __NVIC_GetEnableIRQ
+  #define NVIC_DisableIRQ             __NVIC_DisableIRQ
+  #define NVIC_GetPendingIRQ          __NVIC_GetPendingIRQ
+  #define NVIC_SetPendingIRQ          __NVIC_SetPendingIRQ
+  #define NVIC_ClearPendingIRQ        __NVIC_ClearPendingIRQ
+  #define NVIC_GetActive              __NVIC_GetActive
+  #define NVIC_SetPriority            __NVIC_SetPriority
+  #define NVIC_GetPriority            __NVIC_GetPriority
+  #define NVIC_SystemReset            __NVIC_SystemReset
+#endif /* CMSIS_NVIC_VIRTUAL */
+
+#ifdef CMSIS_VECTAB_VIRTUAL
+  #ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+    #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
+  #endif
+  #include CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else
+  #define NVIC_SetVector              __NVIC_SetVector
+  #define NVIC_GetVector              __NVIC_GetVector
+#endif  /* (CMSIS_VECTAB_VIRTUAL) */
+
+#define NVIC_USER_IRQ_OFFSET          16
+
+
+/* The following EXC_RETURN values are saved the LR on exception entry */
+#define EXC_RETURN_HANDLER         (0xFFFFFFF1UL)     /* return to Handler mode, uses MSP after return                               */
+#define EXC_RETURN_THREAD_MSP      (0xFFFFFFF9UL)     /* return to Thread mode, uses MSP after return                                */
+#define EXC_RETURN_THREAD_PSP      (0xFFFFFFFDUL)     /* return to Thread mode, uses PSP after return                                */
+#define EXC_RETURN_HANDLER_FPU     (0xFFFFFFE1UL)     /* return to Handler mode, uses MSP after return, restore floating-point state */
+#define EXC_RETURN_THREAD_MSP_FPU  (0xFFFFFFE9UL)     /* return to Thread mode, uses MSP after return, restore floating-point state  */
+#define EXC_RETURN_THREAD_PSP_FPU  (0xFFFFFFEDUL)     /* return to Thread mode, uses PSP after return, restore floating-point state  */
+
+
+/**
+  \brief   Set Priority Grouping
+  \details Sets the priority grouping field using the required unlock sequence.
+           The parameter PriorityGroup is assigned to the field SCB->AIRCR [10:8] PRIGROUP field.
+           Only values from 0..7 are used.
+           In case of a conflict between priority grouping and available
+           priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+  \param [in]      PriorityGroup  Priority grouping field.
+ */
+__STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
+{
+  uint32_t reg_value;
+  uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL);             /* only values 0..7 are used          */
+
+  reg_value  =  SCB->AIRCR;                                                   /* read old register configuration    */
+  reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change               */
+  reg_value  =  (reg_value                                   |
+                ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+                (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos)  );              /* Insert write key and priority group */
+  SCB->AIRCR =  reg_value;
+}
+
+
+/**
+  \brief   Get Priority Grouping
+  \details Reads the priority grouping field from the NVIC Interrupt Controller.
+  \return                Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriorityGrouping(void)
+{
+  return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
+
+
+/**
+  \brief   Enable Interrupt
+  \details Enables a device specific interrupt in the NVIC interrupt controller.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    __COMPILER_BARRIER();
+    NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+    __COMPILER_BARRIER();
+  }
+}
+
+
+/**
+  \brief   Get Interrupt Enable status
+  \details Returns a device specific interrupt enable status from the NVIC interrupt controller.
+  \param [in]      IRQn  Device specific interrupt number.
+  \return             0  Interrupt is not enabled.
+  \return             1  Interrupt is enabled.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetEnableIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    return((uint32_t)(((NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+  }
+  else
+  {
+    return(0U);
+  }
+}
+
+
+/**
+  \brief   Disable Interrupt
+  \details Disables a device specific interrupt in the NVIC interrupt controller.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_DisableIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+    __DSB();
+    __ISB();
+  }
+}
+
+
+/**
+  \brief   Get Pending Interrupt
+  \details Reads the NVIC pending register and returns the pending bit for the specified device specific interrupt.
+  \param [in]      IRQn  Device specific interrupt number.
+  \return             0  Interrupt status is not pending.
+  \return             1  Interrupt status is pending.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    return((uint32_t)(((NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+  }
+  else
+  {
+    return(0U);
+  }
+}
+
+
+/**
+  \brief   Set Pending Interrupt
+  \details Sets the pending bit of a device specific interrupt in the NVIC pending register.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+  }
+}
+
+
+/**
+  \brief   Clear Pending Interrupt
+  \details Clears the pending bit of a device specific interrupt in the NVIC pending register.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+  }
+}
+
+
+/**
+  \brief   Get Active Interrupt
+  \details Reads the active register in the NVIC and returns the active bit for the device specific interrupt.
+  \param [in]      IRQn  Device specific interrupt number.
+  \return             0  Interrupt status is not active.
+  \return             1  Interrupt status is active.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetActive(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    return((uint32_t)(((NVIC->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+  }
+  else
+  {
+    return(0U);
+  }
+}
+
+
+/**
+  \brief   Set Interrupt Priority
+  \details Sets the priority of a device specific interrupt or a processor exception.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+  \param [in]      IRQn  Interrupt number.
+  \param [in]  priority  Priority to set.
+  \note    The priority cannot be set for every processor exception.
+ */
+__STATIC_INLINE void __NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->IP[((uint32_t)IRQn)]               = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+  }
+  else
+  {
+    SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+  }
+}
+
+
+/**
+  \brief   Get Interrupt Priority
+  \details Reads the priority of a device specific interrupt or a processor exception.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+  \param [in]   IRQn  Interrupt number.
+  \return             Interrupt Priority.
+                      Value is aligned automatically to the implemented priority bits of the microcontroller.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriority(IRQn_Type IRQn)
+{
+
+  if ((int32_t)(IRQn) >= 0)
+  {
+    return(((uint32_t)NVIC->IP[((uint32_t)IRQn)]               >> (8U - __NVIC_PRIO_BITS)));
+  }
+  else
+  {
+    return(((uint32_t)SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS)));
+  }
+}
+
+
+/**
+  \brief   Encode Priority
+  \details Encodes the priority for an interrupt with the given priority group,
+           preemptive priority value, and subpriority value.
+           In case of a conflict between priority grouping and available
+           priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+  \param [in]     PriorityGroup  Used priority group.
+  \param [in]   PreemptPriority  Preemptive priority value (starting from 0).
+  \param [in]       SubPriority  Subpriority value (starting from 0).
+  \return                        Encoded priority. Value can be used in the function \ref NVIC_SetPriority().
+ */
+__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority)
+{
+  uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL);   /* only values 0..7 are used          */
+  uint32_t PreemptPriorityBits;
+  uint32_t SubPriorityBits;
+
+  PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+  SubPriorityBits     = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+  return (
+           ((PreemptPriority & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL)) << SubPriorityBits) |
+           ((SubPriority     & (uint32_t)((1UL << (SubPriorityBits    )) - 1UL)))
+         );
+}
+
+
+/**
+  \brief   Decode Priority
+  \details Decodes an interrupt priority value with a given priority group to
+           preemptive priority value and subpriority value.
+           In case of a conflict between priority grouping and available
+           priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set.
+  \param [in]         Priority   Priority value, which can be retrieved with the function \ref NVIC_GetPriority().
+  \param [in]     PriorityGroup  Used priority group.
+  \param [out] pPreemptPriority  Preemptive priority value (starting from 0).
+  \param [out]     pSubPriority  Subpriority value (starting from 0).
+ */
+__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* const pPreemptPriority, uint32_t* const pSubPriority)
+{
+  uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL);   /* only values 0..7 are used          */
+  uint32_t PreemptPriorityBits;
+  uint32_t SubPriorityBits;
+
+  PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+  SubPriorityBits     = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+  *pPreemptPriority = (Priority >> SubPriorityBits) & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL);
+  *pSubPriority     = (Priority                   ) & (uint32_t)((1UL << (SubPriorityBits    )) - 1UL);
+}
+
+
+/**
+  \brief   Set Interrupt Vector
+  \details Sets an interrupt vector in SRAM based interrupt vector table.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+           VTOR must been relocated to SRAM before.
+  \param [in]   IRQn      Interrupt number
+  \param [in]   vector    Address of interrupt handler function
+ */
+__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
+{
+  uint32_t *vectors = (uint32_t *)SCB->VTOR;
+  vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+  /* ARM Application Note 321 states that the M4 does not require the architectural barrier */
+}
+
+
+/**
+  \brief   Get Interrupt Vector
+  \details Reads an interrupt vector from interrupt vector table.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+  \param [in]   IRQn      Interrupt number.
+  \return                 Address of interrupt handler function
+ */
+__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
+{
+  uint32_t *vectors = (uint32_t *)SCB->VTOR;
+  return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+}
+
+
+/**
+  \brief   System Reset
+  \details Initiates a system reset request to reset the MCU.
+ */
+__NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
+{
+  __DSB();                                                          /* Ensure all outstanding memory accesses included
+                                                                       buffered write are completed before reset */
+  SCB->AIRCR  = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos)    |
+                           (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) |
+                            SCB_AIRCR_SYSRESETREQ_Msk    );         /* Keep priority group unchanged */
+  __DSB();                                                          /* Ensure completion of memory access */
+
+  for(;;)                                                           /* wait until reset */
+  {
+    __NOP();
+  }
+}
+
+/*@} end of CMSIS_Core_NVICFunctions */
+
+
+/* ##########################  MPU functions  #################################### */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+
+#include "mpu_armv7.h"
+
+#endif
+
+
+/* ##########################  FPU functions  #################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_FpuFunctions FPU Functions
+  \brief    Function that provides FPU type.
+  @{
+ */
+
+/**
+  \brief   get FPU type
+  \details returns the FPU type
+  \returns
+   - \b  0: No FPU
+   - \b  1: Single precision FPU
+   - \b  2: Double + Single precision FPU
+ */
+__STATIC_INLINE uint32_t SCB_GetFPUType(void)
+{
+  uint32_t mvfr0;
+
+  mvfr0 = FPU->MVFR0;
+  if      ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x020U)
+  {
+    return 1U;           /* Single precision FPU */
+  }
+  else
+  {
+    return 0U;           /* No FPU */
+  }
+}
+
+
+/*@} end of CMSIS_Core_FpuFunctions */
+
+
+
+/* ##################################    SysTick function  ############################################ */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_SysTickFunctions SysTick Functions
+  \brief    Functions that configure the System.
+  @{
+ */
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U)
+
+/**
+  \brief   System Tick Configuration
+  \details Initializes the System Timer and its interrupt, and starts the System Tick Timer.
+           Counter is in free running mode to generate periodic interrupts.
+  \param [in]  ticks  Number of ticks between two interrupts.
+  \return          0  Function succeeded.
+  \return          1  Function failed.
+  \note    When the variable <b>__Vendor_SysTickConfig</b> is set to 1, then the
+           function <b>SysTick_Config</b> is not included. In this case, the file <b><i>device</i>.h</b>
+           must contain a vendor-specific implementation of this function.
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks)
+{
+  if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+  {
+    return (1UL);                                                   /* Reload value impossible */
+  }
+
+  SysTick->LOAD  = (uint32_t)(ticks - 1UL);                         /* set reload register */
+  NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+  SysTick->VAL   = 0UL;                                             /* Load the SysTick Counter Value */
+  SysTick->CTRL  = SysTick_CTRL_CLKSOURCE_Msk |
+                   SysTick_CTRL_TICKINT_Msk   |
+                   SysTick_CTRL_ENABLE_Msk;                         /* Enable SysTick IRQ and SysTick Timer */
+  return (0UL);                                                     /* Function successful */
+}
+
+#endif
+
+/*@} end of CMSIS_Core_SysTickFunctions */
+
+
+
+/* ##################################### Debug In/Output function ########################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_core_DebugFunctions ITM Functions
+  \brief    Functions that access the ITM debug interface.
+  @{
+ */
+
+extern volatile int32_t ITM_RxBuffer;                              /*!< External variable to receive characters. */
+#define                 ITM_RXBUFFER_EMPTY  ((int32_t)0x5AA55AA5U) /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */
+
+
+/**
+  \brief   ITM Send Character
+  \details Transmits a character via the ITM channel 0, and
+           \li Just returns when no debugger is connected that has booked the output.
+           \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted.
+  \param [in]     ch  Character to transmit.
+  \returns            Character to transmit.
+ */
+__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch)
+{
+  if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) &&      /* ITM enabled */
+      ((ITM->TER & 1UL               ) != 0UL)   )     /* ITM Port #0 enabled */
+  {
+    while (ITM->PORT[0U].u32 == 0UL)
+    {
+      __NOP();
+    }
+    ITM->PORT[0U].u8 = (uint8_t)ch;
+  }
+  return (ch);
+}
+
+
+/**
+  \brief   ITM Receive Character
+  \details Inputs a character via the external variable \ref ITM_RxBuffer.
+  \return             Received character.
+  \return         -1  No character pending.
+ */
+__STATIC_INLINE int32_t ITM_ReceiveChar (void)
+{
+  int32_t ch = -1;                           /* no character available */
+
+  if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY)
+  {
+    ch = ITM_RxBuffer;
+    ITM_RxBuffer = ITM_RXBUFFER_EMPTY;       /* ready for next character */
+  }
+
+  return (ch);
+}
+
+
+/**
+  \brief   ITM Check Character
+  \details Checks whether a character is pending for reading in the variable \ref ITM_RxBuffer.
+  \return          0  No character available.
+  \return          1  Character available.
+ */
+__STATIC_INLINE int32_t ITM_CheckChar (void)
+{
+
+  if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY)
+  {
+    return (0);                              /* no character available */
+  }
+  else
+  {
+    return (1);                              /*    character available */
+  }
+}
+
+/*@} end of CMSIS_core_DebugFunctions */
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM4_H_DEPENDANT */
+
+#endif /* __CMSIS_GENERIC */

+ 275 - 0
libraries/cmsis/cm4/core_support/mpu_armv7.h

@@ -0,0 +1,275 @@
+/******************************************************************************
+ * @file     mpu_armv7.h
+ * @brief    CMSIS MPU API for Armv7-M MPU
+ * @version  V5.1.1
+ * @date     10. February 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2017-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header    /* treat file as system include file */
+#endif
+
+#ifndef ARM_MPU_ARMV7_H
+#define ARM_MPU_ARMV7_H
+
+#define ARM_MPU_REGION_SIZE_32B      ((uint8_t)0x04U) ///!< MPU Region Size 32 Bytes
+#define ARM_MPU_REGION_SIZE_64B      ((uint8_t)0x05U) ///!< MPU Region Size 64 Bytes
+#define ARM_MPU_REGION_SIZE_128B     ((uint8_t)0x06U) ///!< MPU Region Size 128 Bytes
+#define ARM_MPU_REGION_SIZE_256B     ((uint8_t)0x07U) ///!< MPU Region Size 256 Bytes
+#define ARM_MPU_REGION_SIZE_512B     ((uint8_t)0x08U) ///!< MPU Region Size 512 Bytes
+#define ARM_MPU_REGION_SIZE_1KB      ((uint8_t)0x09U) ///!< MPU Region Size 1 KByte
+#define ARM_MPU_REGION_SIZE_2KB      ((uint8_t)0x0AU) ///!< MPU Region Size 2 KBytes
+#define ARM_MPU_REGION_SIZE_4KB      ((uint8_t)0x0BU) ///!< MPU Region Size 4 KBytes
+#define ARM_MPU_REGION_SIZE_8KB      ((uint8_t)0x0CU) ///!< MPU Region Size 8 KBytes
+#define ARM_MPU_REGION_SIZE_16KB     ((uint8_t)0x0DU) ///!< MPU Region Size 16 KBytes
+#define ARM_MPU_REGION_SIZE_32KB     ((uint8_t)0x0EU) ///!< MPU Region Size 32 KBytes
+#define ARM_MPU_REGION_SIZE_64KB     ((uint8_t)0x0FU) ///!< MPU Region Size 64 KBytes
+#define ARM_MPU_REGION_SIZE_128KB    ((uint8_t)0x10U) ///!< MPU Region Size 128 KBytes
+#define ARM_MPU_REGION_SIZE_256KB    ((uint8_t)0x11U) ///!< MPU Region Size 256 KBytes
+#define ARM_MPU_REGION_SIZE_512KB    ((uint8_t)0x12U) ///!< MPU Region Size 512 KBytes
+#define ARM_MPU_REGION_SIZE_1MB      ((uint8_t)0x13U) ///!< MPU Region Size 1 MByte
+#define ARM_MPU_REGION_SIZE_2MB      ((uint8_t)0x14U) ///!< MPU Region Size 2 MBytes
+#define ARM_MPU_REGION_SIZE_4MB      ((uint8_t)0x15U) ///!< MPU Region Size 4 MBytes
+#define ARM_MPU_REGION_SIZE_8MB      ((uint8_t)0x16U) ///!< MPU Region Size 8 MBytes
+#define ARM_MPU_REGION_SIZE_16MB     ((uint8_t)0x17U) ///!< MPU Region Size 16 MBytes
+#define ARM_MPU_REGION_SIZE_32MB     ((uint8_t)0x18U) ///!< MPU Region Size 32 MBytes
+#define ARM_MPU_REGION_SIZE_64MB     ((uint8_t)0x19U) ///!< MPU Region Size 64 MBytes
+#define ARM_MPU_REGION_SIZE_128MB    ((uint8_t)0x1AU) ///!< MPU Region Size 128 MBytes
+#define ARM_MPU_REGION_SIZE_256MB    ((uint8_t)0x1BU) ///!< MPU Region Size 256 MBytes
+#define ARM_MPU_REGION_SIZE_512MB    ((uint8_t)0x1CU) ///!< MPU Region Size 512 MBytes
+#define ARM_MPU_REGION_SIZE_1GB      ((uint8_t)0x1DU) ///!< MPU Region Size 1 GByte
+#define ARM_MPU_REGION_SIZE_2GB      ((uint8_t)0x1EU) ///!< MPU Region Size 2 GBytes
+#define ARM_MPU_REGION_SIZE_4GB      ((uint8_t)0x1FU) ///!< MPU Region Size 4 GBytes
+
+#define ARM_MPU_AP_NONE 0U ///!< MPU Access Permission no access
+#define ARM_MPU_AP_PRIV 1U ///!< MPU Access Permission privileged access only
+#define ARM_MPU_AP_URO  2U ///!< MPU Access Permission unprivileged access read-only
+#define ARM_MPU_AP_FULL 3U ///!< MPU Access Permission full access
+#define ARM_MPU_AP_PRO  5U ///!< MPU Access Permission privileged access read-only
+#define ARM_MPU_AP_RO   6U ///!< MPU Access Permission read-only access
+
+/** MPU Region Base Address Register Value
+*
+* \param Region The region to be configured, number 0 to 15.
+* \param BaseAddress The base address for the region.
+*/
+#define ARM_MPU_RBAR(Region, BaseAddress) \
+  (((BaseAddress) & MPU_RBAR_ADDR_Msk) |  \
+   ((Region) & MPU_RBAR_REGION_Msk)    |  \
+   (MPU_RBAR_VALID_Msk))
+
+/**
+* MPU Memory Access Attributes
+*
+* \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
+* \param IsShareable       Region is shareable between multiple bus masters.
+* \param IsCacheable       Region is cacheable, i.e. its value may be kept in cache.
+* \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
+*/
+#define ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable)   \
+  ((((TypeExtField) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk)                  | \
+   (((IsShareable)  << MPU_RASR_S_Pos)   & MPU_RASR_S_Msk)                    | \
+   (((IsCacheable)  << MPU_RASR_C_Pos)   & MPU_RASR_C_Msk)                    | \
+   (((IsBufferable) << MPU_RASR_B_Pos)   & MPU_RASR_B_Msk))
+
+/**
+* MPU Region Attribute and Size Register Value
+*
+* \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
+* \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
+* \param AccessAttributes  Memory access attribution, see \ref ARM_MPU_ACCESS_.
+* \param SubRegionDisable  Sub-region disable field.
+* \param Size              Region size of the region to be configured, for example 4K, 8K.
+*/
+#define ARM_MPU_RASR_EX(DisableExec, AccessPermission, AccessAttributes, SubRegionDisable, Size)    \
+  ((((DisableExec)      << MPU_RASR_XN_Pos)   & MPU_RASR_XN_Msk)                                  | \
+   (((AccessPermission) << MPU_RASR_AP_Pos)   & MPU_RASR_AP_Msk)                                  | \
+   (((AccessAttributes) & (MPU_RASR_TEX_Msk | MPU_RASR_S_Msk | MPU_RASR_C_Msk | MPU_RASR_B_Msk))) | \
+   (((SubRegionDisable) << MPU_RASR_SRD_Pos)  & MPU_RASR_SRD_Msk)                                 | \
+   (((Size)             << MPU_RASR_SIZE_Pos) & MPU_RASR_SIZE_Msk)                                | \
+   (((MPU_RASR_ENABLE_Msk))))
+
+/**
+* MPU Region Attribute and Size Register Value
+*
+* \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
+* \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
+* \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
+* \param IsShareable       Region is shareable between multiple bus masters.
+* \param IsCacheable       Region is cacheable, i.e. its value may be kept in cache.
+* \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
+* \param SubRegionDisable  Sub-region disable field.
+* \param Size              Region size of the region to be configured, for example 4K, 8K.
+*/
+#define ARM_MPU_RASR(DisableExec, AccessPermission, TypeExtField, IsShareable, IsCacheable, IsBufferable, SubRegionDisable, Size) \
+  ARM_MPU_RASR_EX(DisableExec, AccessPermission, ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable), SubRegionDisable, Size)
+
+/**
+* MPU Memory Access Attribute for strongly ordered memory.
+*  - TEX: 000b
+*  - Shareable
+*  - Non-cacheable
+*  - Non-bufferable
+*/
+#define ARM_MPU_ACCESS_ORDERED ARM_MPU_ACCESS_(0U, 1U, 0U, 0U)
+
+/**
+* MPU Memory Access Attribute for device memory.
+*  - TEX: 000b (if shareable) or 010b (if non-shareable)
+*  - Shareable or non-shareable
+*  - Non-cacheable
+*  - Bufferable (if shareable) or non-bufferable (if non-shareable)
+*
+* \param IsShareable Configures the device memory as shareable or non-shareable.
+*/
+#define ARM_MPU_ACCESS_DEVICE(IsShareable) ((IsShareable) ? ARM_MPU_ACCESS_(0U, 1U, 0U, 1U) : ARM_MPU_ACCESS_(2U, 0U, 0U, 0U))
+
+/**
+* MPU Memory Access Attribute for normal memory.
+*  - TEX: 1BBb (reflecting outer cacheability rules)
+*  - Shareable or non-shareable
+*  - Cacheable or non-cacheable (reflecting inner cacheability rules)
+*  - Bufferable or non-bufferable (reflecting inner cacheability rules)
+*
+* \param OuterCp Configures the outer cache policy.
+* \param InnerCp Configures the inner cache policy.
+* \param IsShareable Configures the memory as shareable or non-shareable.
+*/
+#define ARM_MPU_ACCESS_NORMAL(OuterCp, InnerCp, IsShareable) ARM_MPU_ACCESS_((4U | (OuterCp)), IsShareable, ((InnerCp) >> 1U), ((InnerCp) & 1U))
+
+/**
+* MPU Memory Access Attribute non-cacheable policy.
+*/
+#define ARM_MPU_CACHEP_NOCACHE 0U
+
+/**
+* MPU Memory Access Attribute write-back, write and read allocate policy.
+*/
+#define ARM_MPU_CACHEP_WB_WRA 1U
+
+/**
+* MPU Memory Access Attribute write-through, no write allocate policy.
+*/
+#define ARM_MPU_CACHEP_WT_NWA 2U
+
+/**
+* MPU Memory Access Attribute write-back, no write allocate policy.
+*/
+#define ARM_MPU_CACHEP_WB_NWA 3U
+
+
+/**
+* Struct for a single MPU Region
+*/
+typedef struct {
+  uint32_t RBAR; //!< The region base address register value (RBAR)
+  uint32_t RASR; //!< The region attribute and size register value (RASR) \ref MPU_RASR
+} ARM_MPU_Region_t;
+
+/** Enable the MPU.
+* \param MPU_Control Default access permissions for unconfigured regions.
+*/
+__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
+{
+  __DMB();
+  MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  __DSB();
+  __ISB();
+}
+
+/** Disable the MPU.
+*/
+__STATIC_INLINE void ARM_MPU_Disable(void)
+{
+  __DMB();
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  MPU->CTRL  &= ~MPU_CTRL_ENABLE_Msk;
+  __DSB();
+  __ISB();
+}
+
+/** Clear and disable the given MPU region.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
+{
+  MPU->RNR = rnr;
+  MPU->RASR = 0U;
+}
+
+/** Configure an MPU region.
+* \param rbar Value for RBAR register.
+* \param rsar Value for RSAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr)
+{
+  MPU->RBAR = rbar;
+  MPU->RASR = rasr;
+}
+
+/** Configure the given MPU region.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rsar Value for RSAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t rasr)
+{
+  MPU->RNR = rnr;
+  MPU->RBAR = rbar;
+  MPU->RASR = rasr;
+}
+
+/** Memcopy with strictly ordered memory access, e.g. for register targets.
+* \param dst Destination data is copied to.
+* \param src Source data is copied from.
+* \param len Amount of data words to be copied.
+*/
+__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
+{
+  uint32_t i;
+  for (i = 0U; i < len; ++i)
+  {
+    dst[i] = src[i];
+  }
+}
+
+/** Load the given number of MPU regions from a table.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
+  while (cnt > MPU_TYPE_RALIASES) {
+    ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), MPU_TYPE_RALIASES*rowWordSize);
+    table += MPU_TYPE_RALIASES;
+    cnt -= MPU_TYPE_RALIASES;
+  }
+  ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), cnt*rowWordSize);
+}
+
+#endif

+ 352 - 0
libraries/cmsis/cm4/core_support/mpu_armv8.h

@@ -0,0 +1,352 @@
+/******************************************************************************
+ * @file     mpu_armv8.h
+ * @brief    CMSIS MPU API for Armv8-M and Armv8.1-M MPU
+ * @version  V5.1.2
+ * @date     10. February 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2017-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header    /* treat file as system include file */
+#endif
+
+#ifndef ARM_MPU_ARMV8_H
+#define ARM_MPU_ARMV8_H
+
+/** \brief Attribute for device memory (outer only) */
+#define ARM_MPU_ATTR_DEVICE                           ( 0U )
+
+/** \brief Attribute for non-cacheable, normal memory */
+#define ARM_MPU_ATTR_NON_CACHEABLE                    ( 4U )
+
+/** \brief Attribute for normal memory (outer and inner)
+* \param NT Non-Transient: Set to 1 for non-transient data.
+* \param WB Write-Back: Set to 1 to use write-back update policy.
+* \param RA Read Allocation: Set to 1 to use cache allocation on read miss.
+* \param WA Write Allocation: Set to 1 to use cache allocation on write miss.
+*/
+#define ARM_MPU_ATTR_MEMORY_(NT, WB, RA, WA) \
+  ((((NT) & 1U) << 3U) | (((WB) & 1U) << 2U) | (((RA) & 1U) << 1U) | ((WA) & 1U))
+
+/** \brief Device memory type non Gathering, non Re-ordering, non Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_nGnRnE (0U)
+
+/** \brief Device memory type non Gathering, non Re-ordering, Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_nGnRE  (1U)
+
+/** \brief Device memory type non Gathering, Re-ordering, Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_nGRE   (2U)
+
+/** \brief Device memory type Gathering, Re-ordering, Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_GRE    (3U)
+
+/** \brief Memory Attribute
+* \param O Outer memory attributes
+* \param I O == ARM_MPU_ATTR_DEVICE: Device memory attributes, else: Inner memory attributes
+*/
+#define ARM_MPU_ATTR(O, I) ((((O) & 0xFU) << 4U) | ((((O) & 0xFU) != 0U) ? ((I) & 0xFU) : (((I) & 0x3U) << 2U)))
+
+/** \brief Normal memory non-shareable  */
+#define ARM_MPU_SH_NON   (0U)
+
+/** \brief Normal memory outer shareable  */
+#define ARM_MPU_SH_OUTER (2U)
+
+/** \brief Normal memory inner shareable  */
+#define ARM_MPU_SH_INNER (3U)
+
+/** \brief Memory access permissions
+* \param RO Read-Only: Set to 1 for read-only memory.
+* \param NP Non-Privileged: Set to 1 for non-privileged memory.
+*/
+#define ARM_MPU_AP_(RO, NP) ((((RO) & 1U) << 1U) | ((NP) & 1U))
+
+/** \brief Region Base Address Register value
+* \param BASE The base address bits [31:5] of a memory region. The value is zero extended. Effective address gets 32 byte aligned.
+* \param SH Defines the Shareability domain for this memory region.
+* \param RO Read-Only: Set to 1 for a read-only memory region.
+* \param NP Non-Privileged: Set to 1 for a non-privileged memory region.
+* \oaram XN eXecute Never: Set to 1 for a non-executable memory region.
+*/
+#define ARM_MPU_RBAR(BASE, SH, RO, NP, XN) \
+  (((BASE) & MPU_RBAR_BASE_Msk) | \
+  (((SH) << MPU_RBAR_SH_Pos) & MPU_RBAR_SH_Msk) | \
+  ((ARM_MPU_AP_(RO, NP) << MPU_RBAR_AP_Pos) & MPU_RBAR_AP_Msk) | \
+  (((XN) << MPU_RBAR_XN_Pos) & MPU_RBAR_XN_Msk))
+
+/** \brief Region Limit Address Register value
+* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
+* \param IDX The attribute index to be associated with this memory region.
+*/
+#define ARM_MPU_RLAR(LIMIT, IDX) \
+  (((LIMIT) & MPU_RLAR_LIMIT_Msk) | \
+  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
+  (MPU_RLAR_EN_Msk))
+
+#if defined(MPU_RLAR_PXN_Pos)
+
+/** \brief Region Limit Address Register with PXN value
+* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
+* \param PXN Privileged execute never. Defines whether code can be executed from this privileged region.
+* \param IDX The attribute index to be associated with this memory region.
+*/
+#define ARM_MPU_RLAR_PXN(LIMIT, PXN, IDX) \
+  (((LIMIT) & MPU_RLAR_LIMIT_Msk) | \
+  (((PXN) << MPU_RLAR_PXN_Pos) & MPU_RLAR_PXN_Msk) | \
+  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
+  (MPU_RLAR_EN_Msk))
+
+#endif
+
+/**
+* Struct for a single MPU Region
+*/
+typedef struct {
+  uint32_t RBAR;                   /*!< Region Base Address Register value */
+  uint32_t RLAR;                   /*!< Region Limit Address Register value */
+} ARM_MPU_Region_t;
+
+/** Enable the MPU.
+* \param MPU_Control Default access permissions for unconfigured regions.
+*/
+__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
+{
+  __DMB();
+  MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  __DSB();
+  __ISB();
+}
+
+/** Disable the MPU.
+*/
+__STATIC_INLINE void ARM_MPU_Disable(void)
+{
+  __DMB();
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  MPU->CTRL  &= ~MPU_CTRL_ENABLE_Msk;
+  __DSB();
+  __ISB();
+}
+
+#ifdef MPU_NS
+/** Enable the Non-secure MPU.
+* \param MPU_Control Default access permissions for unconfigured regions.
+*/
+__STATIC_INLINE void ARM_MPU_Enable_NS(uint32_t MPU_Control)
+{
+  __DMB();
+  MPU_NS->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB_NS->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  __DSB();
+  __ISB();
+}
+
+/** Disable the Non-secure MPU.
+*/
+__STATIC_INLINE void ARM_MPU_Disable_NS(void)
+{
+  __DMB();
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB_NS->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  MPU_NS->CTRL  &= ~MPU_CTRL_ENABLE_Msk;
+  __DSB();
+  __ISB();
+}
+#endif
+
+/** Set the memory attribute encoding to the given MPU.
+* \param mpu Pointer to the MPU to be configured.
+* \param idx The attribute index to be set [0-7]
+* \param attr The attribute value to be set.
+*/
+__STATIC_INLINE void ARM_MPU_SetMemAttrEx(MPU_Type* mpu, uint8_t idx, uint8_t attr)
+{
+  const uint8_t reg = idx / 4U;
+  const uint32_t pos = ((idx % 4U) * 8U);
+  const uint32_t mask = 0xFFU << pos;
+
+  if (reg >= (sizeof(mpu->MAIR) / sizeof(mpu->MAIR[0]))) {
+    return; // invalid index
+  }
+
+  mpu->MAIR[reg] = ((mpu->MAIR[reg] & ~mask) | ((attr << pos) & mask));
+}
+
+/** Set the memory attribute encoding.
+* \param idx The attribute index to be set [0-7]
+* \param attr The attribute value to be set.
+*/
+__STATIC_INLINE void ARM_MPU_SetMemAttr(uint8_t idx, uint8_t attr)
+{
+  ARM_MPU_SetMemAttrEx(MPU, idx, attr);
+}
+
+#ifdef MPU_NS
+/** Set the memory attribute encoding to the Non-secure MPU.
+* \param idx The attribute index to be set [0-7]
+* \param attr The attribute value to be set.
+*/
+__STATIC_INLINE void ARM_MPU_SetMemAttr_NS(uint8_t idx, uint8_t attr)
+{
+  ARM_MPU_SetMemAttrEx(MPU_NS, idx, attr);
+}
+#endif
+
+/** Clear and disable the given MPU region of the given MPU.
+* \param mpu Pointer to MPU to be used.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegionEx(MPU_Type* mpu, uint32_t rnr)
+{
+  mpu->RNR = rnr;
+  mpu->RLAR = 0U;
+}
+
+/** Clear and disable the given MPU region.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
+{
+  ARM_MPU_ClrRegionEx(MPU, rnr);
+}
+
+#ifdef MPU_NS
+/** Clear and disable the given Non-secure MPU region.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegion_NS(uint32_t rnr)
+{
+  ARM_MPU_ClrRegionEx(MPU_NS, rnr);
+}
+#endif
+
+/** Configure the given MPU region of the given MPU.
+* \param mpu Pointer to MPU to be used.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rlar Value for RLAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegionEx(MPU_Type* mpu, uint32_t rnr, uint32_t rbar, uint32_t rlar)
+{
+  mpu->RNR = rnr;
+  mpu->RBAR = rbar;
+  mpu->RLAR = rlar;
+}
+
+/** Configure the given MPU region.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rlar Value for RLAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rnr, uint32_t rbar, uint32_t rlar)
+{
+  ARM_MPU_SetRegionEx(MPU, rnr, rbar, rlar);
+}
+
+#ifdef MPU_NS
+/** Configure the given Non-secure MPU region.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rlar Value for RLAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegion_NS(uint32_t rnr, uint32_t rbar, uint32_t rlar)
+{
+  ARM_MPU_SetRegionEx(MPU_NS, rnr, rbar, rlar);
+}
+#endif
+
+/** Memcopy with strictly ordered memory access, e.g. for register targets.
+* \param dst Destination data is copied to.
+* \param src Source data is copied from.
+* \param len Amount of data words to be copied.
+*/
+__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
+{
+  uint32_t i;
+  for (i = 0U; i < len; ++i)
+  {
+    dst[i] = src[i];
+  }
+}
+
+/** Load the given number of MPU regions from a table to the given MPU.
+* \param mpu Pointer to the MPU registers to be used.
+* \param rnr First region number to be configured.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
+  if (cnt == 1U) {
+    mpu->RNR = rnr;
+    ARM_MPU_OrderedMemcpy(&(mpu->RBAR), &(table->RBAR), rowWordSize);
+  } else {
+    uint32_t rnrBase   = rnr & ~(MPU_TYPE_RALIASES-1U);
+    uint32_t rnrOffset = rnr % MPU_TYPE_RALIASES;
+
+    mpu->RNR = rnrBase;
+    while ((rnrOffset + cnt) > MPU_TYPE_RALIASES) {
+      uint32_t c = MPU_TYPE_RALIASES - rnrOffset;
+      ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), c*rowWordSize);
+      table += c;
+      cnt -= c;
+      rnrOffset = 0U;
+      rnrBase += MPU_TYPE_RALIASES;
+      mpu->RNR = rnrBase;
+    }
+
+    ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), cnt*rowWordSize);
+  }
+}
+
+/** Load the given number of MPU regions from a table.
+* \param rnr First region number to be configured.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_Load(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  ARM_MPU_LoadEx(MPU, rnr, table, cnt);
+}
+
+#ifdef MPU_NS
+/** Load the given number of MPU regions from a table to the Non-secure MPU.
+* \param rnr First region number to be configured.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_Load_NS(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  ARM_MPU_LoadEx(MPU_NS, rnr, table, cnt);
+}
+#endif
+
+#endif
+

+ 337 - 0
libraries/cmsis/cm4/core_support/pmu_armv8.h

@@ -0,0 +1,337 @@
+/******************************************************************************
+ * @file     pmu_armv8.h
+ * @brief    CMSIS PMU API for Armv8.1-M PMU
+ * @version  V1.0.0
+ * @date     24. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header    /* treat file as system include file */
+#endif
+
+#ifndef ARM_PMU_ARMV8_H
+#define ARM_PMU_ARMV8_H
+
+/**
+ * \brief PMU Events
+ * \note  See the Armv8.1-M Architecture Reference Manual for full details on these PMU events.
+ * */
+
+#define ARM_PMU_SW_INCR                              0x0000             /*!< Software update to the PMU_SWINC register, architecturally executed and condition code check pass */
+#define ARM_PMU_L1I_CACHE_REFILL                     0x0001             /*!< L1 I-Cache refill */
+#define ARM_PMU_L1D_CACHE_REFILL                     0x0003             /*!< L1 D-Cache refill */
+#define ARM_PMU_L1D_CACHE                            0x0004             /*!< L1 D-Cache access */
+#define ARM_PMU_LD_RETIRED                           0x0006             /*!< Memory-reading instruction architecturally executed and condition code check pass */
+#define ARM_PMU_ST_RETIRED                           0x0007             /*!< Memory-writing instruction architecturally executed and condition code check pass */
+#define ARM_PMU_INST_RETIRED                         0x0008             /*!< Instruction architecturally executed */
+#define ARM_PMU_EXC_TAKEN                            0x0009             /*!< Exception entry */
+#define ARM_PMU_EXC_RETURN                           0x000A             /*!< Exception return instruction architecturally executed and the condition code check pass */
+#define ARM_PMU_PC_WRITE_RETIRED                     0x000C             /*!< Software change to the Program Counter (PC). Instruction is architecturally executed and condition code check pass */
+#define ARM_PMU_BR_IMMED_RETIRED                     0x000D             /*!< Immediate branch architecturally executed */
+#define ARM_PMU_BR_RETURN_RETIRED                    0x000E             /*!< Function return instruction architecturally executed and the condition code check pass */
+#define ARM_PMU_UNALIGNED_LDST_RETIRED               0x000F             /*!< Unaligned memory memory-reading or memory-writing instruction architecturally executed and condition code check pass */
+#define ARM_PMU_BR_MIS_PRED                          0x0010             /*!< Mispredicted or not predicted branch speculatively executed */
+#define ARM_PMU_CPU_CYCLES                           0x0011             /*!< Cycle */
+#define ARM_PMU_BR_PRED                              0x0012             /*!< Predictable branch speculatively executed */
+#define ARM_PMU_MEM_ACCESS                           0x0013             /*!< Data memory access */
+#define ARM_PMU_L1I_CACHE                            0x0014             /*!< Level 1 instruction cache access */
+#define ARM_PMU_L1D_CACHE_WB                         0x0015             /*!< Level 1 data cache write-back */
+#define ARM_PMU_L2D_CACHE                            0x0016             /*!< Level 2 data cache access */
+#define ARM_PMU_L2D_CACHE_REFILL                     0x0017             /*!< Level 2 data cache refill */
+#define ARM_PMU_L2D_CACHE_WB                         0x0018             /*!< Level 2 data cache write-back */
+#define ARM_PMU_BUS_ACCESS                           0x0019             /*!< Bus access */
+#define ARM_PMU_MEMORY_ERROR                         0x001A             /*!< Local memory error */
+#define ARM_PMU_INST_SPEC                            0x001B             /*!< Instruction speculatively executed */
+#define ARM_PMU_BUS_CYCLES                           0x001D             /*!< Bus cycles */
+#define ARM_PMU_CHAIN                                0x001E             /*!< For an odd numbered counter, increment when an overflow occurs on the preceding even-numbered counter on the same PE */
+#define ARM_PMU_L1D_CACHE_ALLOCATE                   0x001F             /*!< Level 1 data cache allocation without refill */
+#define ARM_PMU_L2D_CACHE_ALLOCATE                   0x0020             /*!< Level 2 data cache allocation without refill */
+#define ARM_PMU_BR_RETIRED                           0x0021             /*!< Branch instruction architecturally executed */
+#define ARM_PMU_BR_MIS_PRED_RETIRED                  0x0022             /*!< Mispredicted branch instruction architecturally executed */
+#define ARM_PMU_STALL_FRONTEND                       0x0023             /*!< No operation issued because of the frontend */
+#define ARM_PMU_STALL_BACKEND                        0x0024             /*!< No operation issued because of the backend */
+#define ARM_PMU_L2I_CACHE                            0x0027             /*!< Level 2 instruction cache access */
+#define ARM_PMU_L2I_CACHE_REFILL                     0x0028             /*!< Level 2 instruction cache refill */
+#define ARM_PMU_L3D_CACHE_ALLOCATE                   0x0029             /*!< Level 3 data cache allocation without refill */
+#define ARM_PMU_L3D_CACHE_REFILL                     0x002A             /*!< Level 3 data cache refill */
+#define ARM_PMU_L3D_CACHE                            0x002B             /*!< Level 3 data cache access */
+#define ARM_PMU_L3D_CACHE_WB                         0x002C             /*!< Level 3 data cache write-back */
+#define ARM_PMU_LL_CACHE_RD                          0x0036             /*!< Last level data cache read */
+#define ARM_PMU_LL_CACHE_MISS_RD                     0x0037             /*!< Last level data cache read miss */
+#define ARM_PMU_L1D_CACHE_MISS_RD                    0x0039             /*!< Level 1 data cache read miss */
+#define ARM_PMU_OP_COMPLETE                          0x003A             /*!< Operation retired */
+#define ARM_PMU_OP_SPEC                              0x003B             /*!< Operation speculatively executed */
+#define ARM_PMU_STALL                                0x003C             /*!< Stall cycle for instruction or operation not sent for execution */
+#define ARM_PMU_STALL_OP_BACKEND                     0x003D             /*!< Stall cycle for instruction or operation not sent for execution due to pipeline backend */
+#define ARM_PMU_STALL_OP_FRONTEND                    0x003E             /*!< Stall cycle for instruction or operation not sent for execution due to pipeline frontend */
+#define ARM_PMU_STALL_OP                             0x003F             /*!< Instruction or operation slots not occupied each cycle */
+#define ARM_PMU_L1D_CACHE_RD                         0x0040             /*!< Level 1 data cache read */
+#define ARM_PMU_LE_RETIRED                           0x0100             /*!< Loop end instruction executed */
+#define ARM_PMU_LE_SPEC                              0x0101             /*!< Loop end instruction speculatively executed */
+#define ARM_PMU_BF_RETIRED                           0x0104             /*!< Branch future instruction architecturally executed and condition code check pass */
+#define ARM_PMU_BF_SPEC                              0x0105             /*!< Branch future instruction speculatively executed and condition code check pass */
+#define ARM_PMU_LE_CANCEL                            0x0108             /*!< Loop end instruction not taken */
+#define ARM_PMU_BF_CANCEL                            0x0109             /*!< Branch future instruction not taken */
+#define ARM_PMU_SE_CALL_S                            0x0114             /*!< Call to secure function, resulting in Security state change */
+#define ARM_PMU_SE_CALL_NS                           0x0115             /*!< Call to non-secure function, resulting in Security state change */
+#define ARM_PMU_DWT_CMPMATCH0                        0x0118             /*!< DWT comparator 0 match */
+#define ARM_PMU_DWT_CMPMATCH1                        0x0119             /*!< DWT comparator 1 match */
+#define ARM_PMU_DWT_CMPMATCH2                        0x011A             /*!< DWT comparator 2 match */
+#define ARM_PMU_DWT_CMPMATCH3                        0x011B             /*!< DWT comparator 3 match */
+#define ARM_PMU_MVE_INST_RETIRED                     0x0200             /*!< MVE instruction architecturally executed */
+#define ARM_PMU_MVE_INST_SPEC                        0x0201             /*!< MVE instruction speculatively executed */
+#define ARM_PMU_MVE_FP_RETIRED                       0x0204             /*!< MVE floating-point instruction architecturally executed */
+#define ARM_PMU_MVE_FP_SPEC                          0x0205             /*!< MVE floating-point instruction speculatively executed */
+#define ARM_PMU_MVE_FP_HP_RETIRED                    0x0208             /*!< MVE half-precision floating-point instruction architecturally executed */
+#define ARM_PMU_MVE_FP_HP_SPEC                       0x0209             /*!< MVE half-precision floating-point instruction speculatively executed */
+#define ARM_PMU_MVE_FP_SP_RETIRED                    0x020C             /*!< MVE single-precision floating-point instruction architecturally executed */
+#define ARM_PMU_MVE_FP_SP_SPEC                       0x020D             /*!< MVE single-precision floating-point instruction speculatively executed */
+#define ARM_PMU_MVE_FP_MAC_RETIRED                   0x0214             /*!< MVE floating-point multiply or multiply-accumulate instruction architecturally executed */
+#define ARM_PMU_MVE_FP_MAC_SPEC                      0x0215             /*!< MVE floating-point multiply or multiply-accumulate instruction speculatively executed */
+#define ARM_PMU_MVE_INT_RETIRED                      0x0224             /*!< MVE integer instruction architecturally executed */
+#define ARM_PMU_MVE_INT_SPEC                         0x0225             /*!< MVE integer instruction speculatively executed */
+#define ARM_PMU_MVE_INT_MAC_RETIRED                  0x0228             /*!< MVE multiply or multiply-accumulate instruction architecturally executed */
+#define ARM_PMU_MVE_INT_MAC_SPEC                     0x0229             /*!< MVE multiply or multiply-accumulate instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_RETIRED                     0x0238             /*!< MVE load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_SPEC                        0x0239             /*!< MVE load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_RETIRED                       0x023C             /*!< MVE load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_SPEC                          0x023D             /*!< MVE load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_RETIRED                       0x0240             /*!< MVE store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_SPEC                          0x0241             /*!< MVE store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_CONTIG_RETIRED              0x0244             /*!< MVE contiguous load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_CONTIG_SPEC                 0x0245             /*!< MVE contiguous load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_CONTIG_RETIRED                0x0248             /*!< MVE contiguous load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_CONTIG_SPEC                   0x0249             /*!< MVE contiguous load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_CONTIG_RETIRED                0x024C             /*!< MVE contiguous store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_CONTIG_SPEC                   0x024D             /*!< MVE contiguous store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_NONCONTIG_RETIRED           0x0250             /*!< MVE non-contiguous load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_NONCONTIG_SPEC              0x0251             /*!< MVE non-contiguous load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_NONCONTIG_RETIRED             0x0254             /*!< MVE non-contiguous load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_NONCONTIG_SPEC                0x0255             /*!< MVE non-contiguous load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_NONCONTIG_RETIRED             0x0258             /*!< MVE non-contiguous store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_NONCONTIG_SPEC                0x0259             /*!< MVE non-contiguous store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_MULTI_RETIRED               0x025C             /*!< MVE memory instruction targeting multiple registers architecturally executed */
+#define ARM_PMU_MVE_LDST_MULTI_SPEC                  0x025D             /*!< MVE memory instruction targeting multiple registers speculatively executed */
+#define ARM_PMU_MVE_LD_MULTI_RETIRED                 0x0260             /*!< MVE memory load instruction targeting multiple registers architecturally executed */
+#define ARM_PMU_MVE_LD_MULTI_SPEC                    0x0261             /*!< MVE memory load instruction targeting multiple registers speculatively executed */
+#define ARM_PMU_MVE_ST_MULTI_RETIRED                 0x0261             /*!< MVE memory store instruction targeting multiple registers architecturally executed */
+#define ARM_PMU_MVE_ST_MULTI_SPEC                    0x0265             /*!< MVE memory store instruction targeting multiple registers speculatively executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_RETIRED           0x028C             /*!< MVE unaligned memory load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_SPEC              0x028D             /*!< MVE unaligned memory load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_UNALIGNED_RETIRED             0x0290             /*!< MVE unaligned load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_UNALIGNED_SPEC                0x0291             /*!< MVE unaligned load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_UNALIGNED_RETIRED             0x0294             /*!< MVE unaligned store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_UNALIGNED_SPEC                0x0295             /*!< MVE unaligned store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_NONCONTIG_RETIRED 0x0298             /*!< MVE unaligned noncontiguous load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_NONCONTIG_SPEC    0x0299             /*!< MVE unaligned noncontiguous load or store instruction speculatively executed */
+#define ARM_PMU_MVE_VREDUCE_RETIRED                  0x02A0             /*!< MVE vector reduction instruction architecturally executed */
+#define ARM_PMU_MVE_VREDUCE_SPEC                     0x02A1             /*!< MVE vector reduction instruction speculatively executed */
+#define ARM_PMU_MVE_VREDUCE_FP_RETIRED               0x02A4             /*!< MVE floating-point vector reduction instruction architecturally executed */
+#define ARM_PMU_MVE_VREDUCE_FP_SPEC                  0x02A5             /*!< MVE floating-point vector reduction instruction speculatively executed */
+#define ARM_PMU_MVE_VREDUCE_INT_RETIRED              0x02A8             /*!< MVE integer vector reduction instruction architecturally executed */
+#define ARM_PMU_MVE_VREDUCE_INT_SPEC                 0x02A9             /*!< MVE integer vector reduction instruction speculatively executed */
+#define ARM_PMU_MVE_PRED                             0x02B8             /*!< Cycles where one or more predicated beats architecturally executed */
+#define ARM_PMU_MVE_STALL                            0x02CC             /*!< Stall cycles caused by an MVE instruction */
+#define ARM_PMU_MVE_STALL_RESOURCE                   0x02CD             /*!< Stall cycles caused by an MVE instruction because of resource conflicts */
+#define ARM_PMU_MVE_STALL_RESOURCE_MEM               0x02CE             /*!< Stall cycles caused by an MVE instruction because of memory resource conflicts */
+#define ARM_PMU_MVE_STALL_RESOURCE_FP                0x02CF             /*!< Stall cycles caused by an MVE instruction because of floating-point resource conflicts */
+#define ARM_PMU_MVE_STALL_RESOURCE_INT               0x02D0             /*!< Stall cycles caused by an MVE instruction because of integer resource conflicts */
+#define ARM_PMU_MVE_STALL_BREAK                      0x02D3             /*!< Stall cycles caused by an MVE chain break */
+#define ARM_PMU_MVE_STALL_DEPENDENCY                 0x02D4             /*!< Stall cycles caused by MVE register dependency */
+#define ARM_PMU_ITCM_ACCESS                          0x4007             /*!< Instruction TCM access */
+#define ARM_PMU_DTCM_ACCESS                          0x4008             /*!< Data TCM access */
+#define ARM_PMU_TRCEXTOUT0                           0x4010             /*!< ETM external output 0 */
+#define ARM_PMU_TRCEXTOUT1                           0x4011             /*!< ETM external output 1 */
+#define ARM_PMU_TRCEXTOUT2                           0x4012             /*!< ETM external output 2 */
+#define ARM_PMU_TRCEXTOUT3                           0x4013             /*!< ETM external output 3 */
+#define ARM_PMU_CTI_TRIGOUT4                         0x4018             /*!< Cross-trigger Interface output trigger 4 */
+#define ARM_PMU_CTI_TRIGOUT5                         0x4019             /*!< Cross-trigger Interface output trigger 5 */
+#define ARM_PMU_CTI_TRIGOUT6                         0x401A             /*!< Cross-trigger Interface output trigger 6 */
+#define ARM_PMU_CTI_TRIGOUT7                         0x401B             /*!< Cross-trigger Interface output trigger 7 */
+
+/** \brief PMU Functions */
+
+__STATIC_INLINE void ARM_PMU_Enable(void);
+__STATIC_INLINE void ARM_PMU_Disable(void);
+
+__STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type);
+
+__STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void);
+__STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void);
+
+__STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask);
+__STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask);
+
+__STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void);
+__STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num);
+
+__STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void);
+__STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask);
+
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask);
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask);
+
+__STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask);
+
+/**
+  \brief   Enable the PMU
+*/
+__STATIC_INLINE void ARM_PMU_Enable(void)
+{
+  PMU->CTRL |= PMU_CTRL_ENABLE_Msk;
+}
+
+/**
+  \brief   Disable the PMU
+*/
+__STATIC_INLINE void ARM_PMU_Disable(void)
+{
+  PMU->CTRL &= ~PMU_CTRL_ENABLE_Msk;
+}
+
+/**
+  \brief   Set event to count for PMU eventer counter
+  \param [in]    num     Event counter (0-30) to configure
+  \param [in]    type    Event to count
+*/
+__STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type)
+{
+  PMU->EVTYPER[num] = type;
+}
+
+/**
+  \brief  Reset cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void)
+{
+  PMU->CTRL |= PMU_CTRL_CYCCNT_RESET_Msk;
+}
+
+/**
+  \brief  Reset all event counters
+*/
+__STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void)
+{
+  PMU->CTRL |= PMU_CTRL_EVENTCNT_RESET_Msk;
+}
+
+/**
+  \brief  Enable counters
+  \param [in]     mask    Counters to enable
+  \note   Enables one or more of the following:
+          - event counters (0-30)
+          - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask)
+{
+  PMU->CNTENSET = mask;
+}
+
+/**
+  \brief  Disable counters
+  \param [in]     mask    Counters to enable
+  \note   Disables one or more of the following:
+          - event counters (0-30)
+          - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask)
+{
+  PMU->CNTENCLR = mask;
+}
+
+/**
+  \brief  Read cycle counter
+  \return                 Cycle count
+*/
+__STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void)
+{
+  return PMU->CCNTR;
+}
+
+/**
+  \brief   Read event counter
+  \param [in]     num     Event counter (0-30) to read
+  \return                 Event count
+*/
+__STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num)
+{
+  return PMU->EVCNTR[num];
+}
+
+/**
+  \brief   Read counter overflow status
+  \return  Counter overflow status bits for the following:
+          - event counters (0-30)
+          - cycle counter
+*/
+__STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void)
+{
+  return PMU->OVSSET;
+}
+
+/**
+  \brief   Clear counter overflow status
+  \param [in]     mask    Counter overflow status bits to clear
+  \note    Clears overflow status bits for one or more of the following:
+           - event counters (0-30)
+           - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask)
+{
+  PMU->OVSCLR = mask;
+}
+
+/**
+  \brief   Enable counter overflow interrupt request
+  \param [in]     mask    Counter overflow interrupt request bits to set
+  \note    Sets overflow interrupt request bits for one or more of the following:
+           - event counters (0-30)
+           - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask)
+{
+  PMU->INTENSET = mask;
+}
+
+/**
+  \brief   Disable counter overflow interrupt request
+  \param [in]     mask    Counter overflow interrupt request bits to clear
+  \note    Clears overflow interrupt request bits for one or more of the following:
+           - event counters (0-30)
+           - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask)
+{
+  PMU->INTENCLR = mask;
+}
+
+/**
+  \brief   Software increment event counter
+  \param [in]     mask    Counters to increment
+  \note    Software increment bits for one or more event counters (0-30)
+*/
+__STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask)
+{
+  PMU->SWINC = mask;
+}
+
+#endif

+ 419 - 0
libraries/cmsis/cm4/device_support/at32f413.h

@@ -0,0 +1,419 @@
+/**
+  **************************************************************************
+  * @file     at32f413.h
+  * @brief    at32f413 header file
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+#ifndef __AT32F413_H
+#define __AT32F413_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined (__CC_ARM)
+ #pragma anon_unions
+#endif
+
+/** @addtogroup CMSIS
+  * @{
+  */
+
+/** @addtogroup AT32F413
+  * @{
+  */
+
+/** @addtogroup Library_configuration_section
+  * @{
+  */
+
+/**
+  * tip: to avoid modifying this file each time you need to switch between these
+  *      devices, you can define the device in your toolchain compiler preprocessor.
+  */
+
+#if !defined (AT32F413KBU7_4) && !defined (AT32F413KCU7_4) && !defined (AT32F413CBU7) && \
+    !defined (AT32F413CCU7)   && !defined (AT32F413C8T7)   && !defined (AT32F413CBT7) && \
+    !defined (AT32F413CCT7)   && !defined (AT32F413RBT7)   && !defined (AT32F413RCT7) && \
+    !defined (AT32FEBKC8T7)   && !defined (AT32F413TBU7)
+
+    #error "Please select first the target device used in your application (in at32f413.h file)"
+#endif
+
+#if defined (AT32F413KBU7_4) || defined (AT32F413KCU7_4) || defined (AT32F413CBU7) || \
+    defined (AT32F413CCU7)   || defined (AT32F413C8T7)   || defined (AT32F413CBT7) || \
+    defined (AT32F413CCT7)   || defined (AT32F413RBT7)   || defined (AT32F413RCT7) || \
+    defined (AT32FEBKC8T7)   || defined (AT32F413TBU7)
+
+    #define AT32F413xx
+#endif
+
+/**
+  * define with package
+  */
+#if defined (AT32F413RBT7)   || defined (AT32F413RCT7)
+
+    #define AT32F413Rx
+#endif
+
+#if defined (AT32F413CBU7)   || defined (AT32F413CCU7)   || defined (AT32F413C8T7) || \
+    defined (AT32F413CBT7)   || defined (AT32F413CCT7)
+
+    #define AT32F413Cx
+#endif
+
+#if defined (AT32F413KBU7_4) || defined (AT32F413KCU7_4)
+
+    #define AT32F413Kx
+#endif
+
+/**
+  * define with memory density
+  */
+#if defined (AT32F413C8T7)   || defined (AT32FEBKC8T7)
+
+    #define AT32F413x8
+#endif
+
+#if defined (AT32F413KBU7_4) || defined (AT32F413CBU7)   || defined (AT32F413CBT7) || \
+    defined (AT32F413RBT7)   || defined (AT32F413TBU7)
+
+    #define AT32F413xB
+#endif
+
+#if defined (AT32F413KCU7_4) || defined (AT32F413CCU7)   || defined (AT32F413CCT7)  || \
+    defined (AT32F413RCT7)
+
+    #define AT32F413xC
+#endif
+
+#ifndef USE_STDPERIPH_DRIVER
+/**
+  * @brief comment the line below if you will not use the peripherals drivers.
+  * in this case, these drivers will not be included and the application code will
+  * be based on direct access to peripherals registers
+  */
+  #ifdef _RTE_
+    #include "RTE_Components.h"
+    #ifdef RTE_DEVICE_STDPERIPH_FRAMEWORK
+      #define USE_STDPERIPH_DRIVER
+    #endif
+  #endif
+#endif
+
+/**
+  * @brief at32f413 standard peripheral library version number
+  */
+#define __AT32F413_LIBRARY_VERSION_MAJOR    (0x02) /*!< [31:24] major version */
+#define __AT32F413_LIBRARY_VERSION_MIDDLE   (0x01) /*!< [23:16] middle version */
+#define __AT32F413_LIBRARY_VERSION_MINOR    (0x03) /*!< [15:8]  minor version */
+#define __AT32F413_LIBRARY_VERSION_RC       (0x00) /*!< [7:0]  release candidate */
+#define __AT32F413_LIBRARY_VERSION          ((__AT32F413_LIBRARY_VERSION_MAJOR << 24)  | \
+                                             (__AT32F413_LIBRARY_VERSION_MIDDLE << 16) | \
+                                             (__AT32F413_LIBRARY_VERSION_MINOR << 8)   | \
+                                             (__AT32F413_LIBRARY_VERSION_RC))
+
+/**
+  * @}
+  */
+
+/** @addtogroup Configuration_section_for_CMSIS
+  * @{
+  */
+
+/**
+  * @brief configuration of the cortex-m4 processor and core peripherals
+  */
+#define __CM4_REV                 0x0001U  /*!< core revision r0p1                           */
+#define __MPU_PRESENT             1        /*!< mpu present                                  */
+#define __NVIC_PRIO_BITS          4        /*!< at32 uses 4 bits for the priority levels     */
+#define __Vendor_SysTickConfig    0        /*!< set to 1 if different systick config is used */
+#define __FPU_PRESENT             1U       /*!< fpu present                                  */
+
+/**
+  * @brief at32f413 interrupt number definition, according to the selected device
+  *        in @ref Library_configuration_section
+  */
+typedef enum IRQn
+{
+    /******  cortex-m4 processor exceptions numbers ***************************************************/
+    Reset_IRQn                  = -15,    /*!< 1 reset vector, invoked on power up and warm reset   */
+    NonMaskableInt_IRQn         = -14,    /*!< 2 non maskable interrupt                             */
+    HardFault_IRQn              = -13,    /*!< 3 hard fault, all classes of fault                   */
+    MemoryManagement_IRQn       = -12,    /*!< 4 cortex-m4 memory management interrupt              */
+    BusFault_IRQn               = -11,    /*!< 5 cortex-m4 bus fault interrupt                      */
+    UsageFault_IRQn             = -10,    /*!< 6 cortex-m4 usage fault interrupt                    */
+    SVCall_IRQn                 = -5,     /*!< 11 cortex-m4 sv call interrupt                       */
+    DebugMonitor_IRQn           = -4,     /*!< 12 cortex-m4 debug monitor interrupt                 */
+    PendSV_IRQn                 = -2,     /*!< 14 cortex-m4 pend sv interrupt                       */
+    SysTick_IRQn                = -1,     /*!< 15 cortex-m4 system tick interrupt                   */
+
+    /******  at32 specific interrupt numbers *********************************************************/
+    WWDT_IRQn                   = 0,      /*!< window watchdog timer interrupt                      */
+    PVM_IRQn                    = 1,      /*!< pvm through exint line detection interrupt           */
+    TAMPER_IRQn                 = 2,      /*!< tamper interrupt                                     */
+    RTC_IRQn                    = 3,      /*!< rtc global interrupt                                 */
+    FLASH_IRQn                  = 4,      /*!< flash global interrupt                               */
+    CRM_IRQn                    = 5,      /*!< crm global interrupt                                 */
+    EXINT0_IRQn                 = 6,      /*!< external line0 interrupt                             */
+    EXINT1_IRQn                 = 7,      /*!< external line1 interrupt                             */
+    EXINT2_IRQn                 = 8,      /*!< external line2 interrupt                             */
+    EXINT3_IRQn                 = 9,      /*!< external line3 interrupt                             */
+    EXINT4_IRQn                 = 10,     /*!< external line4 interrupt                             */
+    DMA1_Channel1_IRQn          = 11,     /*!< dma1 channel 1 global interrupt                      */
+    DMA1_Channel2_IRQn          = 12,     /*!< dma1 channel 2 global interrupt                      */
+    DMA1_Channel3_IRQn          = 13,     /*!< dma1 channel 3 global interrupt                      */
+    DMA1_Channel4_IRQn          = 14,     /*!< dma1 channel 4 global interrupt                      */
+    DMA1_Channel5_IRQn          = 15,     /*!< dma1 channel 5 global interrupt                      */
+    DMA1_Channel6_IRQn          = 16,     /*!< dma1 channel 6 global interrupt                      */
+    DMA1_Channel7_IRQn          = 17,     /*!< dma1 channel 7 global interrupt                      */
+
+    ADC1_2_IRQn                 = 18,     /*!< adc1 and adc2 global interrupt                       */
+    USBFS_H_CAN1_TX_IRQn        = 19,     /*!< usb device high priority or can1 tx interrupts       */
+    USBFS_L_CAN1_RX0_IRQn       = 20,     /*!< usb device low priority or can1 rx0 interrupts       */
+    CAN1_RX1_IRQn               = 21,     /*!< can1 rx1 interrupt                                   */
+    CAN1_SE_IRQn                = 22,     /*!< can1 se interrupt                                    */
+    EXINT9_5_IRQn               = 23,     /*!< external line[9:5] interrupts                        */
+    TMR1_BRK_TMR9_IRQn          = 24,     /*!< tmr1 brake interrupt                                 */
+    TMR1_OVF_TMR10_IRQn         = 25,     /*!< tmr1 overflow interrupt                              */
+    TMR1_TRG_HALL_TMR11_IRQn    = 26,     /*!< tmr1 trigger and hall interrupt                      */
+    TMR1_CH_IRQn                = 27,     /*!< tmr1 channel interrupt                               */
+    TMR2_GLOBAL_IRQn            = 28,     /*!< tmr2 global interrupt                                */
+    TMR3_GLOBAL_IRQn            = 29,     /*!< tmr3 global interrupt                                */
+    TMR4_GLOBAL_IRQn            = 30,     /*!< tmr4 global interrupt                                */
+    I2C1_EVT_IRQn               = 31,     /*!< i2c1 event interrupt                                 */
+    I2C1_ERR_IRQn               = 32,     /*!< i2c1 error interrupt                                 */
+    I2C2_EVT_IRQn               = 33,     /*!< i2c2 event interrupt                                 */
+    I2C2_ERR_IRQn               = 34,     /*!< i2c2 error interrupt                                 */
+    SPI1_IRQn                   = 35,     /*!< spi1 global interrupt                                */
+    SPI2_IRQn                   = 36,     /*!< spi2 global interrupt                                */
+    USART1_IRQn                 = 37,     /*!< usart1 global interrupt                              */
+    USART2_IRQn                 = 38,     /*!< usart2 global interrupt                              */
+    USART3_IRQn                 = 39,     /*!< usart3 global interrupt                              */
+    EXINT15_10_IRQn             = 40,     /*!< external line[15:10] interrupts                      */
+    RTCAlarm_IRQn               = 41,     /*!< rtc alarm through exint line interrupt               */
+    USBFSWakeUp_IRQn            = 42,     /*!< usb device wakeup from suspend through exint line interrupt */
+    TMR8_BRK_IRQn               = 43,     /*!< tmr8 brake interrupt                                 */
+    TMR8_OVF_IRQn               = 44,     /*!< tmr8 overflow interrupt                              */
+    TMR8_TRG_HALL_IRQn          = 45,     /*!< tmr8 trigger and hall interrupt                      */
+    TMR8_CH_IRQn                = 46,     /*!< tmr8 channel interrupt                               */
+    SDIO1_IRQn                  = 49,     /*!< sdio1 global interrupt                               */
+    TMR5_GLOBAL_IRQn            = 50,     /*!< tmr5 global interrupt                                */
+    UART4_IRQn                  = 52,     /*!< uart4 global interrupt                               */
+    UART5_IRQn                  = 53,     /*!< uart5 global interrupt                               */
+    DMA2_Channel1_IRQn          = 56,     /*!< dma2 channel 1 global interrupt                      */
+    DMA2_Channel2_IRQn          = 57,     /*!< dma2 channel 2 global interrupt                      */
+    DMA2_Channel3_IRQn          = 58,     /*!< dma2 channel 3 global interrupt                      */
+    DMA2_Channel4_5_IRQn        = 59,     /*!< dma2 channel 4 and channel 5 global interrupt        */
+    CAN2_TX_IRQn                = 68,     /*!< can2 tx interrupt                                    */
+    CAN2_RX0_IRQn               = 69,     /*!< can2 rx0 interrupt                                   */
+    CAN2_RX1_IRQn               = 70,     /*!< can2 rx1 interrupt                                   */
+    CAN2_SE_IRQn                = 71,     /*!< can2 se interrupt                                    */
+    ACC_IRQn                    = 72,     /*!< acc interrupt                                        */
+    USBFS_MAPH_IRQn             = 73,     /*!< usb map hp interrupt                                 */
+    USBFS_MAPL_IRQn             = 74,     /*!< usb map lp interrupt                                 */
+    DMA2_Channel6_7_IRQn        = 75      /*!< dma2 channel 6 and channel 7 global interrupt        */
+
+} IRQn_Type;
+
+/**
+  * @}
+  */
+
+#include "core_cm4.h"
+#include "system_at32f413.h"
+#include <stdint.h>
+
+/** @addtogroup Exported_types
+  * @{
+  */
+
+typedef int32_t  INT32;
+typedef int16_t  INT16;
+typedef int8_t   INT8;
+typedef uint32_t UINT32;
+typedef uint16_t UINT16;
+typedef uint8_t  UINT8;
+
+typedef int32_t  s32;
+typedef int16_t  s16;
+typedef int8_t   s8;
+
+typedef const int32_t sc32;   /*!< read only */
+typedef const int16_t sc16;   /*!< read only */
+typedef const int8_t  sc8;    /*!< read only */
+
+typedef __IO int32_t  vs32;
+typedef __IO int16_t  vs16;
+typedef __IO int8_t   vs8;
+
+typedef __I int32_t vsc32;    /*!< read only */
+typedef __I int16_t vsc16;    /*!< read only */
+typedef __I int8_t  vsc8;     /*!< read only */
+
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t  u8;
+
+typedef const uint32_t uc32;  /*!< read only */
+typedef const uint16_t uc16;  /*!< read only */
+typedef const uint8_t  uc8;   /*!< read only */
+
+typedef __IO uint32_t vu32;
+typedef __IO uint16_t vu16;
+typedef __IO uint8_t  vu8;
+
+typedef __I uint32_t vuc32;   /*!< read only */
+typedef __I uint16_t vuc16;   /*!< read only */
+typedef __I uint8_t  vuc8;    /*!< read only */
+
+/**
+  * @brief flag status
+  */
+typedef enum {RESET = 0, SET = !RESET} flag_status;
+
+/**
+  * @brief confirm state
+  */
+typedef enum {FALSE = 0, TRUE = !FALSE} confirm_state;
+
+/**
+  * @brief error status
+  */
+typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
+
+/**
+  * @}
+  */
+
+/** @addtogroup Exported_macro
+  * @{
+  */
+
+#define REG8(addr)                       *(volatile uint8_t *)(addr)
+#define REG16(addr)                      *(volatile uint16_t *)(addr)
+#define REG32(addr)                      *(volatile uint32_t *)(addr)
+
+#define MAKE_VALUE(reg_offset, bit_num)  (uint32_t)(((reg_offset) << 16) | (bit_num & 0x1F))
+
+#define PERIPH_REG(periph_base, value)   REG32((periph_base + (value >> 16)))
+#define PERIPH_REG_BIT(value)            (0x1U << (value & 0x1F))
+
+/**
+  * @}
+  */
+
+/** @addtogroup Peripheral_memory_map
+  * @{
+  */
+
+#define FLASH_BASE                       ((uint32_t)0x08000000)
+#define SPIM_FLASH_BASE                  ((uint32_t)0x08400000)
+#define USD_BASE                         ((uint32_t)0x1FFFF800)
+#define SRAM_BASE                        ((uint32_t)0x20000000)
+#define PERIPH_BASE                      ((uint32_t)0x40000000)
+#define XMC_REG_BASE                     ((uint32_t)0xA0000000)
+#define DEBUG_BASE                       ((uint32_t)0xE0042000)
+
+#define APB1PERIPH_BASE                  (PERIPH_BASE + 0x00000)
+#define APB2PERIPH_BASE                  (PERIPH_BASE + 0x10000)
+#define AHBPERIPH_BASE                   (PERIPH_BASE + 0x20000)
+
+/* apb1 bus base address */
+#define TMR2_BASE                        (APB1PERIPH_BASE + 0x0000)
+#define TMR3_BASE                        (APB1PERIPH_BASE + 0x0400)
+#define TMR4_BASE                        (APB1PERIPH_BASE + 0x0800)
+#define TMR5_BASE                        (APB1PERIPH_BASE + 0x0C00)
+#define RTC_BASE                         (APB1PERIPH_BASE + 0x2800)
+#define WWDT_BASE                        (APB1PERIPH_BASE + 0x2C00)
+#define WDT_BASE                         (APB1PERIPH_BASE + 0x3000)
+#define SPI2_BASE                        (APB1PERIPH_BASE + 0x3800)
+#define USART2_BASE                      (APB1PERIPH_BASE + 0x4400)
+#define USART3_BASE                      (APB1PERIPH_BASE + 0x4800)
+#define UART4_BASE                       (APB1PERIPH_BASE + 0x4C00)
+#define UART5_BASE                       (APB1PERIPH_BASE + 0x5000)
+#define I2C1_BASE                        (APB1PERIPH_BASE + 0x5400)
+#define I2C2_BASE                        (APB1PERIPH_BASE + 0x5800)
+#define USBFS_BASE                       (APB1PERIPH_BASE + 0x5C00)
+#define CAN1_BASE                        (APB1PERIPH_BASE + 0x6400)
+#define CAN2_BASE                        (APB1PERIPH_BASE + 0x6800)
+#define BPR_BASE                         (APB1PERIPH_BASE + 0x6C00)
+#define PWC_BASE                         (APB1PERIPH_BASE + 0x7000)
+/* apb2 bus base address */
+#define IOMUX_BASE                       (APB2PERIPH_BASE + 0x0000)
+#define EXINT_BASE                       (APB2PERIPH_BASE + 0x0400)
+#define GPIOA_BASE                       (APB2PERIPH_BASE + 0x0800)
+#define GPIOB_BASE                       (APB2PERIPH_BASE + 0x0C00)
+#define GPIOC_BASE                       (APB2PERIPH_BASE + 0x1000)
+#define GPIOD_BASE                       (APB2PERIPH_BASE + 0x1400)
+#define GPIOF_BASE                       (APB2PERIPH_BASE + 0x1C00)
+#define ADC1_BASE                        (APB2PERIPH_BASE + 0x2400)
+#define ADC2_BASE                        (APB2PERIPH_BASE + 0x2800)
+#define TMR1_BASE                        (APB2PERIPH_BASE + 0x2C00)
+#define SPI1_BASE                        (APB2PERIPH_BASE + 0x3000)
+#define TMR8_BASE                        (APB2PERIPH_BASE + 0x3400)
+#define USART1_BASE                      (APB2PERIPH_BASE + 0x3800)
+#define TMR9_BASE                        (APB2PERIPH_BASE + 0x4C00)
+#define TMR10_BASE                       (APB2PERIPH_BASE + 0x5000)
+#define TMR11_BASE                       (APB2PERIPH_BASE + 0x5400)
+#define ACC_BASE                         (APB2PERIPH_BASE + 0x5800)
+#define SDIO1_BASE                       (APB2PERIPH_BASE + 0x8000)
+/* ahb bus base address */
+#define DMA1_BASE                        (AHBPERIPH_BASE + 0x0000)
+#define DMA1_CHANNEL1_BASE               (AHBPERIPH_BASE + 0x0008)
+#define DMA1_CHANNEL2_BASE               (AHBPERIPH_BASE + 0x001C)
+#define DMA1_CHANNEL3_BASE               (AHBPERIPH_BASE + 0x0030)
+#define DMA1_CHANNEL4_BASE               (AHBPERIPH_BASE + 0x0044)
+#define DMA1_CHANNEL5_BASE               (AHBPERIPH_BASE + 0x0058)
+#define DMA1_CHANNEL6_BASE               (AHBPERIPH_BASE + 0x006C)
+#define DMA1_CHANNEL7_BASE               (AHBPERIPH_BASE + 0x0080)
+#define DMA2_BASE                        (AHBPERIPH_BASE + 0x0400)
+#define DMA2_CHANNEL1_BASE               (AHBPERIPH_BASE + 0x0408)
+#define DMA2_CHANNEL2_BASE               (AHBPERIPH_BASE + 0x041C)
+#define DMA2_CHANNEL3_BASE               (AHBPERIPH_BASE + 0x0430)
+#define DMA2_CHANNEL4_BASE               (AHBPERIPH_BASE + 0x0444)
+#define DMA2_CHANNEL5_BASE               (AHBPERIPH_BASE + 0x0458)
+#define DMA2_CHANNEL6_BASE               (AHBPERIPH_BASE + 0x046C)
+#define DMA2_CHANNEL7_BASE               (AHBPERIPH_BASE + 0x0480)
+#define CRM_BASE                         (AHBPERIPH_BASE + 0x1000)
+#define FLASH_REG_BASE                   (AHBPERIPH_BASE + 0x2000)
+#define CRC_BASE                         (AHBPERIPH_BASE + 0x3000)
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+#include "at32f413_def.h"
+#include "at32f413_conf.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 150 - 0
libraries/cmsis/cm4/device_support/at32f413_conf_template.h

@@ -0,0 +1,150 @@
+/**
+  **************************************************************************
+  * @file     at32f413_conf.h
+  * @brief    at32f413 config header file
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+/* define to prevent recursive inclusion -------------------------------------*/
+#ifndef __AT32F413_CONF_H
+#define __AT32F413_CONF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+  * @brief in the following line adjust the value of high speed external crystal (hext)
+  * used in your application
+  * tip: to avoid modifying this file each time you need to use different hext, you
+  *      can define the hext value in your toolchain compiler preprocessor.
+  */
+#if !defined  HEXT_VALUE
+#define HEXT_VALUE               ((uint32_t)8000000) /*!< value of the high speed external crystal in hz */
+#endif
+
+/**
+  * @brief in the following line adjust the high speed external crystal (hext) startup
+  * timeout value
+  */
+#define HEXT_STARTUP_TIMEOUT             ((uint16_t)0x3000)  /*!< time out for hext start up */
+#define HICK_VALUE                       ((uint32_t)8000000) /*!< value of the high speed internal clock in hz */
+#define LEXT_VALUE                       ((uint32_t)32768)   /*!< value of the low speed external clock in hz */
+
+/* module define -------------------------------------------------------------*/
+#define CRM_MODULE_ENABLED
+#define TMR_MODULE_ENABLED
+#define RTC_MODULE_ENABLED
+#define BPR_MODULE_ENABLED
+#define GPIO_MODULE_ENABLED
+#define I2C_MODULE_ENABLED
+#define USART_MODULE_ENABLED
+#define PWC_MODULE_ENABLED
+#define CAN_MODULE_ENABLED
+#define ADC_MODULE_ENABLED
+#define SPI_MODULE_ENABLED
+#define DMA_MODULE_ENABLED
+#define DEBUG_MODULE_ENABLED
+#define FLASH_MODULE_ENABLED
+#define CRC_MODULE_ENABLED
+#define WWDT_MODULE_ENABLED
+#define WDT_MODULE_ENABLED
+#define EXINT_MODULE_ENABLED
+#define SDIO_MODULE_ENABLED
+#define USB_MODULE_ENABLED
+#define ACC_MODULE_ENABLED
+#define MISC_MODULE_ENABLED
+
+/* includes ------------------------------------------------------------------*/
+#ifdef CRM_MODULE_ENABLED
+#include "at32f413_crm.h"
+#endif
+#ifdef TMR_MODULE_ENABLED
+#include "at32f413_tmr.h"
+#endif
+#ifdef RTC_MODULE_ENABLED
+#include "at32f413_rtc.h"
+#endif
+#ifdef BPR_MODULE_ENABLED
+#include "at32f413_bpr.h"
+#endif
+#ifdef GPIO_MODULE_ENABLED
+#include "at32f413_gpio.h"
+#endif
+#ifdef I2C_MODULE_ENABLED
+#include "at32f413_i2c.h"
+#endif
+#ifdef USART_MODULE_ENABLED
+#include "at32f413_usart.h"
+#endif
+#ifdef PWC_MODULE_ENABLED
+#include "at32f413_pwc.h"
+#endif
+#ifdef CAN_MODULE_ENABLED
+#include "at32f413_can.h"
+#endif
+#ifdef ADC_MODULE_ENABLED
+#include "at32f413_adc.h"
+#endif
+#ifdef SPI_MODULE_ENABLED
+#include "at32f413_spi.h"
+#endif
+#ifdef DMA_MODULE_ENABLED
+#include "at32f413_dma.h"
+#endif
+#ifdef DEBUG_MODULE_ENABLED
+#include "at32f413_debug.h"
+#endif
+#ifdef FLASH_MODULE_ENABLED
+#include "at32f413_flash.h"
+#endif
+#ifdef CRC_MODULE_ENABLED
+#include "at32f413_crc.h"
+#endif
+#ifdef WWDT_MODULE_ENABLED
+#include "at32f413_wwdt.h"
+#endif
+#ifdef WDT_MODULE_ENABLED
+#include "at32f413_wdt.h"
+#endif
+#ifdef EXINT_MODULE_ENABLED
+#include "at32f413_exint.h"
+#endif
+#ifdef SDIO_MODULE_ENABLED
+#include "at32f413_sdio.h"
+#endif
+#ifdef ACC_MODULE_ENABLED
+#include "at32f413_acc.h"
+#endif
+#ifdef MISC_MODULE_ENABLED
+#include "at32f413_misc.h"
+#endif
+#ifdef USB_MODULE_ENABLED
+#include "at32f413_usb.h"
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __AT32F413_CONF_H */
+
+

+ 168 - 0
libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F413x8_FLASH.ld

@@ -0,0 +1,168 @@
+/*
+*****************************************************************************
+**
+**  File        : AT32F413x8_FLASH.ld
+**
+**  Abstract    : Linker script for AT32F413x8 Device with
+**                64KByte FLASH, 32KByte RAM
+**
+**                Set heap size, stack size and stack location according
+**                to application requirements.
+**
+**                Set memory bank area and size if external memory is used.
+**
+**  Target      : Artery Tek AT32
+**
+**  Environment : Arm gcc toolchain
+**
+*****************************************************************************
+*/
+
+/* Entry Point */
+ENTRY(Reset_Handler)
+
+/* Highest address of the user mode stack */
+_estack = 0x20008000;    /* end of RAM */
+
+/* Generate a link error if heap and stack don't fit into RAM */
+_Min_Heap_Size = 0x200;      /* required amount of heap  */
+_Min_Stack_Size = 0x400; /* required amount of stack */
+
+/* Specify the memory areas */
+MEMORY
+{
+FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 64K
+RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 32K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
+}
+
+/* Define output sections */
+SECTIONS
+{
+  /* The startup code goes first into FLASH */
+  .isr_vector :
+  {
+    . = ALIGN(4);
+    KEEP(*(.isr_vector)) /* Startup code */
+    . = ALIGN(4);
+  } >FLASH
+
+  /* The program code and other data goes into FLASH */
+  .text :
+  {
+    . = ALIGN(4);
+    *(.text)           /* .text sections (code) */
+    *(.text*)          /* .text* sections (code) */
+    *(.glue_7)         /* glue arm to thumb code */
+    *(.glue_7t)        /* glue thumb to arm code */
+    *(.eh_frame)
+
+    KEEP (*(.init))
+    KEEP (*(.fini))
+
+    . = ALIGN(4);
+    _etext = .;        /* define a global symbols at end of code */
+  } >FLASH
+
+  /* Constant data goes into FLASH */
+  .rodata :
+  {
+    . = ALIGN(4);
+    *(.rodata)         /* .rodata sections (constants, strings, etc.) */
+    *(.rodata*)        /* .rodata* sections (constants, strings, etc.) */
+    . = ALIGN(4);
+  } >FLASH
+
+  .ARM.extab   : { *(.ARM.extab* .gnu.linkonce.armextab.*) } >FLASH
+  .ARM : {
+    __exidx_start = .;
+    *(.ARM.exidx*)
+    __exidx_end = .;
+  } >FLASH
+
+  .preinit_array     :
+  {
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP (*(.preinit_array*))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+  } >FLASH
+  .init_array :
+  {
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP (*(SORT(.init_array.*)))
+    KEEP (*(.init_array*))
+    PROVIDE_HIDDEN (__init_array_end = .);
+  } >FLASH
+  .fini_array :
+  {
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP (*(SORT(.fini_array.*)))
+    KEEP (*(.fini_array*))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+  } >FLASH
+
+  /* used by the startup to initialize data */
+  _sidata = LOADADDR(.data);
+
+  /* Initialized data sections goes into RAM, load LMA copy after code */
+  .data : 
+  {
+    . = ALIGN(4);
+    _sdata = .;        /* create a global symbol at data start */
+    *(.data)           /* .data sections */
+    *(.data*)          /* .data* sections */
+
+    . = ALIGN(4);
+    _edata = .;        /* define a global symbol at data end */
+  } >RAM AT> FLASH
+
+  _spim_init_base = LOADADDR(.spim);
+  _spim_init_length = SIZEOF(.spim);
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* create a global symbol at spim start */
+    *(.spim)                /* .spim sections */
+    *(.spim*)               /* .spim* sections */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbols at end of spim */
+  } >SPIM
+
+  /* Uninitialized data section */
+  . = ALIGN(4);
+  .bss :
+  {
+    /* This is used by the startup in order to initialize the .bss secion */
+    _sbss = .;         /* define a global symbol at bss start */
+    __bss_start__ = _sbss;
+    *(.bss)
+    *(.bss*)
+    *(COMMON)
+
+    . = ALIGN(4);
+    _ebss = .;         /* define a global symbol at bss end */
+    __bss_end__ = _ebss;
+  } >RAM
+
+  /* User_heap_stack section, used to check that there is enough RAM left */
+  ._user_heap_stack :
+  {
+    . = ALIGN(8);
+    PROVIDE ( end = . );
+    PROVIDE ( _end = . );
+    . = . + _Min_Heap_Size;
+    . = . + _Min_Stack_Size;
+    . = ALIGN(8);
+  } >RAM
+
+  /* Remove information from the standard libraries */
+  /DISCARD/ :
+  {
+    libc.a ( * )
+    libm.a ( * )
+    libgcc.a ( * )
+  }
+
+  .ARM.attributes 0 : { *(.ARM.attributes) }
+}

+ 168 - 0
libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F413xB_FLASH.ld

@@ -0,0 +1,168 @@
+/*
+*****************************************************************************
+**
+**  File        : AT32F413xB_FLASH.ld
+**
+**  Abstract    : Linker script for AT32F413xB Device with
+**                128KByte FLASH, 32KByte RAM
+**
+**                Set heap size, stack size and stack location according
+**                to application requirements.
+**
+**                Set memory bank area and size if external memory is used.
+**
+**  Target      : Artery Tek AT32
+**
+**  Environment : Arm gcc toolchain
+**
+*****************************************************************************
+*/
+
+/* Entry Point */
+ENTRY(Reset_Handler)
+
+/* Highest address of the user mode stack */
+_estack = 0x20008000;    /* end of RAM */
+
+/* Generate a link error if heap and stack don't fit into RAM */
+_Min_Heap_Size = 0x200;      /* required amount of heap  */
+_Min_Stack_Size = 0x400; /* required amount of stack */
+
+/* Specify the memory areas */
+MEMORY
+{
+FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 128K
+RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 32K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
+}
+
+/* Define output sections */
+SECTIONS
+{
+  /* The startup code goes first into FLASH */
+  .isr_vector :
+  {
+    . = ALIGN(4);
+    KEEP(*(.isr_vector)) /* Startup code */
+    . = ALIGN(4);
+  } >FLASH
+
+  /* The program code and other data goes into FLASH */
+  .text :
+  {
+    . = ALIGN(4);
+    *(.text)           /* .text sections (code) */
+    *(.text*)          /* .text* sections (code) */
+    *(.glue_7)         /* glue arm to thumb code */
+    *(.glue_7t)        /* glue thumb to arm code */
+    *(.eh_frame)
+
+    KEEP (*(.init))
+    KEEP (*(.fini))
+
+    . = ALIGN(4);
+    _etext = .;        /* define a global symbols at end of code */
+  } >FLASH
+
+  /* Constant data goes into FLASH */
+  .rodata :
+  {
+    . = ALIGN(4);
+    *(.rodata)         /* .rodata sections (constants, strings, etc.) */
+    *(.rodata*)        /* .rodata* sections (constants, strings, etc.) */
+    . = ALIGN(4);
+  } >FLASH
+
+  .ARM.extab   : { *(.ARM.extab* .gnu.linkonce.armextab.*) } >FLASH
+  .ARM : {
+    __exidx_start = .;
+    *(.ARM.exidx*)
+    __exidx_end = .;
+  } >FLASH
+
+  .preinit_array     :
+  {
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP (*(.preinit_array*))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+  } >FLASH
+  .init_array :
+  {
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP (*(SORT(.init_array.*)))
+    KEEP (*(.init_array*))
+    PROVIDE_HIDDEN (__init_array_end = .);
+  } >FLASH
+  .fini_array :
+  {
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP (*(SORT(.fini_array.*)))
+    KEEP (*(.fini_array*))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+  } >FLASH
+
+  /* used by the startup to initialize data */
+  _sidata = LOADADDR(.data);
+
+  /* Initialized data sections goes into RAM, load LMA copy after code */
+  .data : 
+  {
+    . = ALIGN(4);
+    _sdata = .;        /* create a global symbol at data start */
+    *(.data)           /* .data sections */
+    *(.data*)          /* .data* sections */
+
+    . = ALIGN(4);
+    _edata = .;        /* define a global symbol at data end */
+  } >RAM AT> FLASH
+
+  _spim_init_base = LOADADDR(.spim);
+  _spim_init_length = SIZEOF(.spim);
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* create a global symbol at spim start */
+    *(.spim)                /* .spim sections */
+    *(.spim*)               /* .spim* sections */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbols at end of spim */
+  } >SPIM
+
+  /* Uninitialized data section */
+  . = ALIGN(4);
+  .bss :
+  {
+    /* This is used by the startup in order to initialize the .bss secion */
+    _sbss = .;         /* define a global symbol at bss start */
+    __bss_start__ = _sbss;
+    *(.bss)
+    *(.bss*)
+    *(COMMON)
+
+    . = ALIGN(4);
+    _ebss = .;         /* define a global symbol at bss end */
+    __bss_end__ = _ebss;
+  } >RAM
+
+  /* User_heap_stack section, used to check that there is enough RAM left */
+  ._user_heap_stack :
+  {
+    . = ALIGN(8);
+    PROVIDE ( end = . );
+    PROVIDE ( _end = . );
+    . = . + _Min_Heap_Size;
+    . = . + _Min_Stack_Size;
+    . = ALIGN(8);
+  } >RAM
+
+  /* Remove information from the standard libraries */
+  /DISCARD/ :
+  {
+    libc.a ( * )
+    libm.a ( * )
+    libgcc.a ( * )
+  }
+
+  .ARM.attributes 0 : { *(.ARM.attributes) }
+}

+ 168 - 0
libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F413xC_FLASH.ld

@@ -0,0 +1,168 @@
+/*
+*****************************************************************************
+**
+**  File        : AT32F413xC_FLASH.ld
+**
+**  Abstract    : Linker script for AT32F413xC Device with
+**                256KByte FLASH, 32KByte RAM
+**
+**                Set heap size, stack size and stack location according
+**                to application requirements.
+**
+**                Set memory bank area and size if external memory is used.
+**
+**  Target      : Artery Tek AT32
+**
+**  Environment : Arm gcc toolchain
+**
+*****************************************************************************
+*/
+
+/* Entry Point */
+ENTRY(Reset_Handler)
+
+/* Highest address of the user mode stack */
+_estack = 0x20008000;    /* end of RAM */
+
+/* Generate a link error if heap and stack don't fit into RAM */
+_Min_Heap_Size = 0x200;      /* required amount of heap  */
+_Min_Stack_Size = 0x400; /* required amount of stack */
+
+/* Specify the memory areas */
+MEMORY
+{
+FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 256K
+RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 32K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
+}
+
+/* Define output sections */
+SECTIONS
+{
+  /* The startup code goes first into FLASH */
+  .isr_vector :
+  {
+    . = ALIGN(4);
+    KEEP(*(.isr_vector)) /* Startup code */
+    . = ALIGN(4);
+  } >FLASH
+
+  /* The program code and other data goes into FLASH */
+  .text :
+  {
+    . = ALIGN(4);
+    *(.text)           /* .text sections (code) */
+    *(.text*)          /* .text* sections (code) */
+    *(.glue_7)         /* glue arm to thumb code */
+    *(.glue_7t)        /* glue thumb to arm code */
+    *(.eh_frame)
+
+    KEEP (*(.init))
+    KEEP (*(.fini))
+
+    . = ALIGN(4);
+    _etext = .;        /* define a global symbols at end of code */
+  } >FLASH
+
+  /* Constant data goes into FLASH */
+  .rodata :
+  {
+    . = ALIGN(4);
+    *(.rodata)         /* .rodata sections (constants, strings, etc.) */
+    *(.rodata*)        /* .rodata* sections (constants, strings, etc.) */
+    . = ALIGN(4);
+  } >FLASH
+
+  .ARM.extab   : { *(.ARM.extab* .gnu.linkonce.armextab.*) } >FLASH
+  .ARM : {
+    __exidx_start = .;
+    *(.ARM.exidx*)
+    __exidx_end = .;
+  } >FLASH
+
+  .preinit_array     :
+  {
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP (*(.preinit_array*))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+  } >FLASH
+  .init_array :
+  {
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP (*(SORT(.init_array.*)))
+    KEEP (*(.init_array*))
+    PROVIDE_HIDDEN (__init_array_end = .);
+  } >FLASH
+  .fini_array :
+  {
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP (*(SORT(.fini_array.*)))
+    KEEP (*(.fini_array*))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+  } >FLASH
+
+  /* used by the startup to initialize data */
+  _sidata = LOADADDR(.data);
+
+  /* Initialized data sections goes into RAM, load LMA copy after code */
+  .data : 
+  {
+    . = ALIGN(4);
+    _sdata = .;        /* create a global symbol at data start */
+    *(.data)           /* .data sections */
+    *(.data*)          /* .data* sections */
+
+    . = ALIGN(4);
+    _edata = .;        /* define a global symbol at data end */
+  } >RAM AT> FLASH
+
+  _spim_init_base = LOADADDR(.spim);
+  _spim_init_length = SIZEOF(.spim);
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* create a global symbol at spim start */
+    *(.spim)                /* .spim sections */
+    *(.spim*)               /* .spim* sections */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbols at end of spim */
+  } >SPIM
+
+  /* Uninitialized data section */
+  . = ALIGN(4);
+  .bss :
+  {
+    /* This is used by the startup in order to initialize the .bss secion */
+    _sbss = .;         /* define a global symbol at bss start */
+    __bss_start__ = _sbss;
+    *(.bss)
+    *(.bss*)
+    *(COMMON)
+
+    . = ALIGN(4);
+    _ebss = .;         /* define a global symbol at bss end */
+    __bss_end__ = _ebss;
+  } >RAM
+
+  /* User_heap_stack section, used to check that there is enough RAM left */
+  ._user_heap_stack :
+  {
+    . = ALIGN(8);
+    PROVIDE ( end = . );
+    PROVIDE ( _end = . );
+    . = . + _Min_Heap_Size;
+    . = . + _Min_Stack_Size;
+    . = ALIGN(8);
+  } >RAM
+
+  /* Remove information from the standard libraries */
+  /DISCARD/ :
+  {
+    libc.a ( * )
+    libm.a ( * )
+    libgcc.a ( * )
+  }
+
+  .ARM.attributes 0 : { *(.ARM.attributes) }
+}

+ 431 - 0
libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f413.s

@@ -0,0 +1,431 @@
+/**
+  ******************************************************************************
+  * @file     startup_at32f413.s
+  * @brief    at32f413xx devices vector table for gcc toolchain.
+  *           this module performs:
+  *           - set the initial sp
+  *           - set the initial pc == reset_handler,
+  *           - set the vector table entries with the exceptions isr address
+  *           - configure the clock system and the external sram to
+  *             be used as data memory (optional, to be enabled by user)
+  *           - branches to main in the c library (which eventually
+  *             calls main()).
+  *           after reset the cortex-m4 processor is in thread mode,
+  *           priority is privileged, and the stack is set to main.
+  ******************************************************************************
+  */
+
+  .syntax unified
+  .cpu cortex-m4
+  .fpu softvfp
+  .thumb
+
+.global  g_pfnVectors
+.global  Default_Handler
+
+/* start address for the initialization values of the .data section.
+defined in linker script */
+.word  _sidata
+/* start address for the .data section. defined in linker script */
+.word  _sdata
+/* end address for the .data section. defined in linker script */
+.word  _edata
+/* start address for the .bss section. defined in linker script */
+.word  _sbss
+/* end address for the .bss section. defined in linker script */
+.word  _ebss
+/* stack used for SystemInit_ExtMemCtl; always internal RAM used */
+
+/**
+ * @brief  This is the code that gets called when the processor first
+ *          starts execution following a reset event. Only the absolutely
+ *          necessary set is performed, after which the application
+ *          supplied main() routine is called.
+ * @param  None
+ * @retval None
+*/
+
+    .section  .text.Reset_Handler
+  .weak  Reset_Handler
+  .type  Reset_Handler, %function
+Reset_Handler:
+
+/* Copy the data segment initializers from flash to SRAM */
+  movs  r1, #0
+  b  LoopCopyDataInit
+
+CopyDataInit:
+  ldr  r3, =_sidata
+  ldr  r3, [r3, r1]
+  str  r3, [r0, r1]
+  adds  r1, r1, #4
+
+LoopCopyDataInit:
+  ldr  r0, =_sdata
+  ldr  r3, =_edata
+  adds  r2, r0, r1
+  cmp  r2, r3
+  bcc  CopyDataInit
+  ldr  r2, =_sbss
+  b  LoopFillZerobss
+/* Zero fill the bss segment. */
+FillZerobss:
+  movs  r3, #0
+  str  r3, [r2], #4
+
+LoopFillZerobss:
+  ldr  r3, = _ebss
+  cmp  r2, r3
+  bcc  FillZerobss
+
+/* Call the clock system intitialization function.*/
+  bl  SystemInit
+/* Call static constructors */
+  bl __libc_init_array
+/* Call the application's entry point.*/
+  bl  main
+  bx  lr
+.size  Reset_Handler, .-Reset_Handler
+
+/**
+ * @brief  This is the code that gets called when the processor receives an
+ *         unexpected interrupt.  This simply enters an infinite loop, preserving
+ *         the system state for examination by a debugger.
+ * @param  None
+ * @retval None
+*/
+    .section  .text.Default_Handler,"ax",%progbits
+Default_Handler:
+Infinite_Loop:
+  b  Infinite_Loop
+  .size  Default_Handler, .-Default_Handler
+/******************************************************************************
+*
+* The minimal vector table for a Cortex M3. Note that the proper constructs
+* must be placed on this to ensure that it ends up at physical address
+* 0x0000.0000.
+*
+*******************************************************************************/
+   .section  .isr_vector,"a",%progbits
+  .type  g_pfnVectors, %object
+  .size  g_pfnVectors, .-g_pfnVectors
+
+
+g_pfnVectors:
+  .word  _estack
+  .word  Reset_Handler
+  .word  NMI_Handler
+  .word  HardFault_Handler
+  .word  MemManage_Handler
+  .word  BusFault_Handler
+  .word  UsageFault_Handler
+  .word  0
+  .word  0
+  .word  0
+  .word  0
+  .word  SVC_Handler
+  .word  DebugMon_Handler
+  .word  0
+  .word  PendSV_Handler
+  .word  SysTick_Handler
+
+  /* External Interrupts */
+  .word  WWDT_IRQHandler                     /* Window Watchdog Timer                   */
+  .word  PVM_IRQHandler                      /* PVM through EXINT Line detect           */
+  .word  TAMPER_IRQHandler                   /* Tamper                                  */
+  .word  RTC_IRQHandler                      /* RTC                                     */
+  .word  FLASH_IRQHandler                    /* Flash                                   */
+  .word  CRM_IRQHandler                      /* CRM                                     */
+  .word  EXINT0_IRQHandler                   /* EXINT Line 0                            */
+  .word  EXINT1_IRQHandler                   /* EXINT Line 1                            */
+  .word  EXINT2_IRQHandler                   /* EXINT Line 2                            */
+  .word  EXINT3_IRQHandler                   /* EXINT Line 3                            */
+  .word  EXINT4_IRQHandler                   /* EXINT Line 4                            */
+  .word  DMA1_Channel1_IRQHandler            /* DMA1 Channel 1                          */
+  .word  DMA1_Channel2_IRQHandler            /* DMA1 Channel 2                          */
+  .word  DMA1_Channel3_IRQHandler            /* DMA1 Channel 3                          */
+  .word  DMA1_Channel4_IRQHandler            /* DMA1 Channel 4                          */
+  .word  DMA1_Channel5_IRQHandler            /* DMA1 Channel 5                          */
+  .word  DMA1_Channel6_IRQHandler            /* DMA1 Channel 6                          */
+  .word  DMA1_Channel7_IRQHandler            /* DMA1 Channel 7                          */
+  .word  ADC1_2_IRQHandler                   /* ADC1 & ADC2                             */
+  .word  USBFS_H_CAN1_TX_IRQHandler          /* USB High Priority or CAN1 TX            */
+  .word  USBFS_L_CAN1_RX0_IRQHandler         /* USB Low  Priority or CAN1 RX0           */
+  .word  CAN1_RX1_IRQHandler                 /* CAN1 RX1                                */
+  .word  CAN1_SE_IRQHandler                  /* CAN1 SE                                 */
+  .word  EXINT9_5_IRQHandler                 /* EXINT Line [9:5]                        */
+  .word  TMR1_BRK_TMR9_IRQHandler            /* TMR1 Brake and TMR9                     */
+  .word  TMR1_OVF_TMR10_IRQHandler           /* TMR1 Overflow and TMR10                 */
+  .word  TMR1_TRG_HALL_TMR11_IRQHandler      /* TMR1 Trigger and hall and TMR11         */
+  .word  TMR1_CH_IRQHandler                  /* TMR1 Channel                            */
+  .word  TMR2_GLOBAL_IRQHandler              /* TMR2                                    */
+  .word  TMR3_GLOBAL_IRQHandler              /* TMR3                                    */
+  .word  TMR4_GLOBAL_IRQHandler              /* TMR4                                    */
+  .word  I2C1_EVT_IRQHandler                 /* I2C1 Event                              */
+  .word  I2C1_ERR_IRQHandler                 /* I2C1 Error                              */
+  .word  I2C2_EVT_IRQHandler                 /* I2C2 Event                              */
+  .word  I2C2_ERR_IRQHandler                 /* I2C2 Error                              */
+  .word  SPI1_IRQHandler                     /* SPI1                                    */
+  .word  SPI2_IRQHandler                     /* Reserved                                */
+  .word  USART1_IRQHandler                   /* USART1                                  */
+  .word  USART2_IRQHandler                   /* USART2                                  */
+  .word  USART3_IRQHandler                   /* USART3                                  */
+  .word  EXINT15_10_IRQHandler               /* EXINT Line [15:10]                      */
+  .word  RTCAlarm_IRQHandler                 /* RTC Alarm through EXINT Line            */
+  .word  USBFSWakeUp_IRQHandler              /* USB Wakeup from suspend                 */
+  .word  TMR8_BRK_IRQHandler                 /* TMR8 Brake                              */
+  .word  TMR8_OVF_IRQHandler                 /* TMR8 Overflow                           */
+  .word  TMR8_TRG_HALL_IRQHandler            /* TMR8 Trigger and hall                   */
+  .word  TMR8_CH_IRQHandler                  /* TMR8 Channel                            */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  SDIO1_IRQHandler                    /* SDIO1                                   */
+  .word  TMR5_GLOBAL_IRQHandler              /* TMR5                                    */
+  .word  0                                   /* Reserved                                */
+  .word  UART4_IRQHandler                    /* UART4                                   */
+  .word  UART5_IRQHandler                    /* UART5                                   */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  DMA2_Channel1_IRQHandler            /* DMA2 Channel1                           */
+  .word  DMA2_Channel2_IRQHandler            /* DMA2 Channel2                           */
+  .word  DMA2_Channel3_IRQHandler            /* DMA2 Channel3                           */
+  .word  DMA2_Channel4_5_IRQHandler          /* DMA2 Channel4 & Channel5                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  0                                   /* Reserved                                */
+  .word  CAN2_TX_IRQHandler                  /* CAN2 TX                                 */
+  .word  CAN2_RX0_IRQHandler                 /* CAN2 RX0                                */
+  .word  CAN2_RX1_IRQHandler                 /* CAN2 RX1                                */
+  .word  CAN2_SE_IRQHandler                  /* CAN2 SE                                 */
+  .word  ACC_IRQHandler                      /* ACC                                     */
+  .word  USBFS_MAPH_IRQHandler               /* USB Map HP                              */
+  .word  USBFS_MAPL_IRQHandler               /* USB Map LP                              */
+  .word  DMA2_Channel6_7_IRQHandler          /* DMA2 Channel6 & Channel7                */
+
+/*******************************************************************************
+*
+* Provide weak aliases for each Exception handler to the Default_Handler.
+* As they are weak aliases, any function with the same name will override
+* this definition.
+*
+*******************************************************************************/
+   .weak      NMI_Handler
+   .thumb_set NMI_Handler,Default_Handler
+
+   .weak      HardFault_Handler
+   .thumb_set HardFault_Handler,Default_Handler
+
+   .weak      MemManage_Handler
+   .thumb_set MemManage_Handler,Default_Handler
+
+   .weak      BusFault_Handler
+   .thumb_set BusFault_Handler,Default_Handler
+
+   .weak      UsageFault_Handler
+   .thumb_set UsageFault_Handler,Default_Handler
+
+   .weak      SVC_Handler
+   .thumb_set SVC_Handler,Default_Handler
+
+   .weak      DebugMon_Handler
+   .thumb_set DebugMon_Handler,Default_Handler
+
+   .weak      PendSV_Handler
+   .thumb_set PendSV_Handler,Default_Handler
+
+   .weak      SysTick_Handler
+   .thumb_set SysTick_Handler,Default_Handler
+
+   .weak      WWDT_IRQHandler
+   .thumb_set WWDT_IRQHandler,Default_Handler
+
+   .weak      PVM_IRQHandler
+   .thumb_set PVM_IRQHandler,Default_Handler
+
+   .weak      TAMPER_IRQHandler
+   .thumb_set TAMPER_IRQHandler,Default_Handler
+
+   .weak      RTC_IRQHandler
+   .thumb_set RTC_IRQHandler,Default_Handler
+
+   .weak      FLASH_IRQHandler
+   .thumb_set FLASH_IRQHandler,Default_Handler
+
+   .weak      CRM_IRQHandler
+   .thumb_set CRM_IRQHandler,Default_Handler
+
+   .weak      EXINT0_IRQHandler
+   .thumb_set EXINT0_IRQHandler,Default_Handler
+
+   .weak      EXINT1_IRQHandler
+   .thumb_set EXINT1_IRQHandler,Default_Handler
+
+   .weak      EXINT2_IRQHandler
+   .thumb_set EXINT2_IRQHandler,Default_Handler
+
+   .weak      EXINT3_IRQHandler
+   .thumb_set EXINT3_IRQHandler,Default_Handler
+
+   .weak      EXINT4_IRQHandler
+   .thumb_set EXINT4_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel1_IRQHandler
+   .thumb_set DMA1_Channel1_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel2_IRQHandler
+   .thumb_set DMA1_Channel2_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel3_IRQHandler
+   .thumb_set DMA1_Channel3_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel4_IRQHandler
+   .thumb_set DMA1_Channel4_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel5_IRQHandler
+   .thumb_set DMA1_Channel5_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel6_IRQHandler
+   .thumb_set DMA1_Channel6_IRQHandler,Default_Handler
+
+   .weak      DMA1_Channel7_IRQHandler
+   .thumb_set DMA1_Channel7_IRQHandler,Default_Handler
+
+   .weak      ADC1_2_IRQHandler
+   .thumb_set ADC1_2_IRQHandler,Default_Handler
+
+   .weak      USBFS_H_CAN1_TX_IRQHandler
+   .thumb_set USBFS_H_CAN1_TX_IRQHandler,Default_Handler
+
+   .weak      USBFS_L_CAN1_RX0_IRQHandler
+   .thumb_set USBFS_L_CAN1_RX0_IRQHandler,Default_Handler
+
+   .weak      CAN1_RX1_IRQHandler
+   .thumb_set CAN1_RX1_IRQHandler,Default_Handler
+
+   .weak      CAN1_SE_IRQHandler
+   .thumb_set CAN1_SE_IRQHandler,Default_Handler
+
+   .weak      EXINT9_5_IRQHandler
+   .thumb_set EXINT9_5_IRQHandler,Default_Handler
+
+   .weak      TMR1_BRK_TMR9_IRQHandler
+   .thumb_set TMR1_BRK_TMR9_IRQHandler,Default_Handler
+
+   .weak      TMR1_OVF_TMR10_IRQHandler
+   .thumb_set TMR1_OVF_TMR10_IRQHandler,Default_Handler
+
+   .weak      TMR1_TRG_HALL_TMR11_IRQHandler
+   .thumb_set TMR1_TRG_HALL_TMR11_IRQHandler,Default_Handler
+
+   .weak      TMR1_CH_IRQHandler
+   .thumb_set TMR1_CH_IRQHandler,Default_Handler
+
+   .weak      TMR2_GLOBAL_IRQHandler
+   .thumb_set TMR2_GLOBAL_IRQHandler,Default_Handler
+
+   .weak      TMR3_GLOBAL_IRQHandler
+   .thumb_set TMR3_GLOBAL_IRQHandler,Default_Handler
+
+   .weak      TMR4_GLOBAL_IRQHandler
+   .thumb_set TMR4_GLOBAL_IRQHandler,Default_Handler
+
+   .weak      I2C1_EVT_IRQHandler
+   .thumb_set I2C1_EVT_IRQHandler,Default_Handler
+
+   .weak      I2C1_ERR_IRQHandler
+   .thumb_set I2C1_ERR_IRQHandler,Default_Handler
+
+   .weak      I2C2_EVT_IRQHandler
+   .thumb_set I2C2_EVT_IRQHandler,Default_Handler
+
+   .weak      I2C2_ERR_IRQHandler
+   .thumb_set I2C2_ERR_IRQHandler,Default_Handler
+
+   .weak      SPI1_IRQHandler
+   .thumb_set SPI1_IRQHandler,Default_Handler
+
+   .weak      SPI2_IRQHandler
+   .thumb_set SPI2_IRQHandler,Default_Handler
+
+   .weak      USART1_IRQHandler
+   .thumb_set USART1_IRQHandler,Default_Handler
+
+   .weak      USART2_IRQHandler
+   .thumb_set USART2_IRQHandler,Default_Handler
+
+   .weak      USART3_IRQHandler
+   .thumb_set USART3_IRQHandler,Default_Handler
+
+   .weak      EXINT15_10_IRQHandler
+   .thumb_set EXINT15_10_IRQHandler,Default_Handler
+
+   .weak      RTCAlarm_IRQHandler
+   .thumb_set RTCAlarm_IRQHandler,Default_Handler
+
+   .weak      USBFSWakeUp_IRQHandler
+   .thumb_set USBFSWakeUp_IRQHandler,Default_Handler
+
+   .weak      TMR8_BRK_IRQHandler
+   .thumb_set TMR8_BRK_IRQHandler,Default_Handler
+
+   .weak      TMR8_OVF_IRQHandler
+   .thumb_set TMR8_OVF_IRQHandler,Default_Handler
+
+   .weak      TMR8_TRG_HALL_IRQHandler
+   .thumb_set TMR8_TRG_HALL_IRQHandler,Default_Handler
+
+   .weak      TMR8_CH_IRQHandler
+   .thumb_set TMR8_CH_IRQHandler,Default_Handler
+
+   .weak      SDIO1_IRQHandler
+   .thumb_set SDIO1_IRQHandler,Default_Handler
+
+   .weak      TMR5_GLOBAL_IRQHandler
+   .thumb_set TMR5_GLOBAL_IRQHandler,Default_Handler
+
+   .weak      UART4_IRQHandler
+   .thumb_set UART4_IRQHandler,Default_Handler
+
+   .weak      UART5_IRQHandler
+   .thumb_set UART5_IRQHandler,Default_Handler
+
+   .weak      DMA2_Channel1_IRQHandler
+   .thumb_set DMA2_Channel1_IRQHandler,Default_Handler
+
+   .weak      DMA2_Channel2_IRQHandler
+   .thumb_set DMA2_Channel2_IRQHandler,Default_Handler
+
+   .weak      DMA2_Channel3_IRQHandler
+   .thumb_set DMA2_Channel3_IRQHandler,Default_Handler
+
+   .weak      DMA2_Channel4_5_IRQHandler
+   .thumb_set DMA2_Channel4_5_IRQHandler,Default_Handler
+
+   .weak      CAN2_TX_IRQHandler
+   .thumb_set CAN2_TX_IRQHandler,Default_Handler
+
+   .weak      CAN2_RX0_IRQHandler
+   .thumb_set CAN2_RX0_IRQHandler ,Default_Handler
+
+   .weak      CAN2_RX1_IRQHandler
+   .thumb_set CAN2_RX1_IRQHandler ,Default_Handler
+
+   .weak      CAN2_SE_IRQHandler
+   .thumb_set CAN2_SE_IRQHandler,Default_Handler
+
+   .weak      ACC_IRQHandler
+   .thumb_set ACC_IRQHandler,Default_Handler
+
+   .weak      USBFS_MAPH_IRQHandler
+   .thumb_set USBFS_MAPH_IRQHandler,Default_Handler
+
+   .weak      USBFS_MAPL_IRQHandler
+   .thumb_set USBFS_MAPL_IRQHandler,Default_Handler
+
+   .weak      DMA2_Channel6_7_IRQHandler
+   .thumb_set DMA2_Channel6_7_IRQHandler,Default_Handler

+ 30 - 0
libraries/cmsis/cm4/device_support/startup/iar/linker/AT32F413x8.icf

@@ -0,0 +1,30 @@
+/*###ICF### Section handled by ICF editor, don't touch! ****/
+/*-Editor annotation file-*/
+/* IcfEditorFile="$TOOLKIT_DIR$\config\ide\IcfEditor\cortex_v1_0.xml" */
+/*-Specials-*/
+define symbol __ICFEDIT_intvec_start__ = 0x08000000;
+/*-Memory Regions-*/
+define symbol __ICFEDIT_region_ROM_start__ = 0x08000000;
+define symbol __ICFEDIT_region_ROM_end__   = 0x0800FFFF;
+define symbol __ICFEDIT_region_RAM_start__ = 0x20000000;
+define symbol __ICFEDIT_region_RAM_end__   = 0x20007FFF;
+/*-Sizes-*/
+define symbol __ICFEDIT_size_cstack__ = 0x1000;
+define symbol __ICFEDIT_size_heap__   = 0x1000;
+/**** End of ICF editor section. ###ICF###*/
+
+define memory mem with size = 4G;
+define region ROM_region   = mem:[from __ICFEDIT_region_ROM_start__   to __ICFEDIT_region_ROM_end__];
+define region RAM_region   = mem:[from __ICFEDIT_region_RAM_start__   to __ICFEDIT_region_RAM_end__];
+
+define block CSTACK    with alignment = 8, size = __ICFEDIT_size_cstack__   { };
+define block HEAP      with alignment = 8, size = __ICFEDIT_size_heap__     { };
+
+initialize by copy { readwrite };
+do not initialize  { section .noinit };
+
+place at address mem:__ICFEDIT_intvec_start__ { readonly section .intvec };
+
+place in ROM_region   { readonly };
+place in RAM_region   { readwrite,
+                        block CSTACK, block HEAP };

+ 30 - 0
libraries/cmsis/cm4/device_support/startup/iar/linker/AT32F413xB.icf

@@ -0,0 +1,30 @@
+/*###ICF### Section handled by ICF editor, don't touch! ****/
+/*-Editor annotation file-*/
+/* IcfEditorFile="$TOOLKIT_DIR$\config\ide\IcfEditor\cortex_v1_0.xml" */
+/*-Specials-*/
+define symbol __ICFEDIT_intvec_start__ = 0x08000000;
+/*-Memory Regions-*/
+define symbol __ICFEDIT_region_ROM_start__ = 0x08000000;
+define symbol __ICFEDIT_region_ROM_end__   = 0x0801FFFF;
+define symbol __ICFEDIT_region_RAM_start__ = 0x20000000;
+define symbol __ICFEDIT_region_RAM_end__   = 0x20007FFF;
+/*-Sizes-*/
+define symbol __ICFEDIT_size_cstack__ = 0x1000;
+define symbol __ICFEDIT_size_heap__   = 0x1000;
+/**** End of ICF editor section. ###ICF###*/
+
+define memory mem with size = 4G;
+define region ROM_region   = mem:[from __ICFEDIT_region_ROM_start__   to __ICFEDIT_region_ROM_end__];
+define region RAM_region   = mem:[from __ICFEDIT_region_RAM_start__   to __ICFEDIT_region_RAM_end__];
+
+define block CSTACK    with alignment = 8, size = __ICFEDIT_size_cstack__   { };
+define block HEAP      with alignment = 8, size = __ICFEDIT_size_heap__     { };
+
+initialize by copy { readwrite };
+do not initialize  { section .noinit };
+
+place at address mem:__ICFEDIT_intvec_start__ { readonly section .intvec };
+
+place in ROM_region   { readonly };
+place in RAM_region   { readwrite,
+                        block CSTACK, block HEAP };

+ 30 - 0
libraries/cmsis/cm4/device_support/startup/iar/linker/AT32F413xC.icf

@@ -0,0 +1,30 @@
+/*###ICF### Section handled by ICF editor, don't touch! ****/
+/*-Editor annotation file-*/
+/* IcfEditorFile="$TOOLKIT_DIR$\config\ide\IcfEditor\cortex_v1_0.xml" */
+/*-Specials-*/
+define symbol __ICFEDIT_intvec_start__ = 0x08000000;
+/*-Memory Regions-*/
+define symbol __ICFEDIT_region_ROM_start__ = 0x08000000;
+define symbol __ICFEDIT_region_ROM_end__   = 0x0803FFFF;
+define symbol __ICFEDIT_region_RAM_start__ = 0x20000000;
+define symbol __ICFEDIT_region_RAM_end__   = 0x20007FFF;
+/*-Sizes-*/
+define symbol __ICFEDIT_size_cstack__ = 0x1000;
+define symbol __ICFEDIT_size_heap__   = 0x1000;
+/**** End of ICF editor section. ###ICF###*/
+
+define memory mem with size = 4G;
+define region ROM_region   = mem:[from __ICFEDIT_region_ROM_start__   to __ICFEDIT_region_ROM_end__];
+define region RAM_region   = mem:[from __ICFEDIT_region_RAM_start__   to __ICFEDIT_region_RAM_end__];
+
+define block CSTACK    with alignment = 8, size = __ICFEDIT_size_cstack__   { };
+define block HEAP      with alignment = 8, size = __ICFEDIT_size_heap__     { };
+
+initialize by copy { readwrite };
+do not initialize  { section .noinit };
+
+place at address mem:__ICFEDIT_intvec_start__ { readonly section .intvec };
+
+place in ROM_region   { readonly };
+place in RAM_region   { readwrite,
+                        block CSTACK, block HEAP };

+ 496 - 0
libraries/cmsis/cm4/device_support/startup/iar/startup_at32f413.s

@@ -0,0 +1,496 @@
+;**************************************************************************
+;* @file     startup_at32f413.s
+;* @brief    at32f413 startup file for IAR Systems
+;**************************************************************************
+;
+
+; Amount of memory (in bytes) allocated for Stack
+; Tailor this value to your application needs
+; <h> Stack Configuration
+;   <o> Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+; </h>
+;
+
+        MODULE  ?cstartup
+
+        ;; Forward declaration of sections.
+        SECTION CSTACK:DATA:NOROOT(3)
+
+        SECTION .intvec:CODE:NOROOT(2)
+
+        EXTERN  __iar_program_start
+        EXTERN  SystemInit
+        PUBLIC  __vector_table
+
+        DATA
+__vector_table
+        DCD     sfe(CSTACK)
+        DCD     Reset_Handler                       ; Reset Handler
+        DCD     NMI_Handler                         ; NMI Handler
+        DCD     HardFault_Handler                   ; Hard Fault Handler
+        DCD     MemManage_Handler                   ; MPU Fault Handler
+        DCD     BusFault_Handler                    ; Bus Fault Handler
+        DCD     UsageFault_Handler                  ; Usage Fault Handler
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     SVC_Handler                         ; SVCall Handler
+        DCD     DebugMon_Handler                    ; Debug Monitor Handler
+        DCD     0                                   ; Reserved
+        DCD     PendSV_Handler                      ; PendSV Handler
+        DCD     SysTick_Handler                     ; SysTick Handler
+
+        ; External Interrupts
+        DCD     WWDT_IRQHandler                     ; Window Watchdog Timer
+        DCD     PVM_IRQHandler                      ; PVM through EXINT Line detect
+        DCD     TAMPER_IRQHandler                   ; Tamper
+        DCD     RTC_IRQHandler                      ; RTC
+        DCD     FLASH_IRQHandler                    ; Flash
+        DCD     CRM_IRQHandler                      ; CRM
+        DCD     EXINT0_IRQHandler                   ; EXINT Line 0
+        DCD     EXINT1_IRQHandler                   ; EXINT Line 1
+        DCD     EXINT2_IRQHandler                   ; EXINT Line 2
+        DCD     EXINT3_IRQHandler                   ; EXINT Line 3
+        DCD     EXINT4_IRQHandler                   ; EXINT Line 4
+        DCD     DMA1_Channel1_IRQHandler            ; DMA1 Channel 1
+        DCD     DMA1_Channel2_IRQHandler            ; DMA1 Channel 2
+        DCD     DMA1_Channel3_IRQHandler            ; DMA1 Channel 3
+        DCD     DMA1_Channel4_IRQHandler            ; DMA1 Channel 4
+        DCD     DMA1_Channel5_IRQHandler            ; DMA1 Channel 5
+        DCD     DMA1_Channel6_IRQHandler            ; DMA1 Channel 6
+        DCD     DMA1_Channel7_IRQHandler            ; DMA1 Channel 7
+        DCD     ADC1_2_IRQHandler                   ; ADC1 & ADC2
+        DCD     USBFS_H_CAN1_TX_IRQHandler          ; USB High Priority or CAN1 TX
+        DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0
+        DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1
+        DCD     CAN1_SE_IRQHandler                  ; CAN1 SE
+        DCD     EXINT9_5_IRQHandler                 ; EXINT Line [9:5]
+        DCD     TMR1_BRK_TMR9_IRQHandler            ; TMR1 Brake and TMR9
+        DCD     TMR1_OVF_TMR10_IRQHandler           ; TMR1 Overflow and TMR10
+        DCD     TMR1_TRG_HALL_TMR11_IRQHandler      ; TMR1 Trigger and hall and TMR11
+        DCD     TMR1_CH_IRQHandler                  ; TMR1 Channel
+        DCD     TMR2_GLOBAL_IRQHandler              ; TMR2
+        DCD     TMR3_GLOBAL_IRQHandler              ; TMR3
+        DCD     TMR4_GLOBAL_IRQHandler              ; TMR4
+        DCD     I2C1_EVT_IRQHandler                 ; I2C1 Event
+        DCD     I2C1_ERR_IRQHandler                 ; I2C1 Error
+        DCD     I2C2_EVT_IRQHandler                 ; I2C2 Event
+        DCD     I2C2_ERR_IRQHandler                 ; I2C2 Error
+        DCD     SPI1_IRQHandler                     ; SPI1
+        DCD     SPI2_IRQHandler                     ; SPI2
+        DCD     USART1_IRQHandler                   ; USART1
+        DCD     USART2_IRQHandler                   ; USART2
+        DCD     USART3_IRQHandler                   ; USART3
+        DCD     EXINT15_10_IRQHandler               ; EXINT Line [15:10]
+        DCD     RTCAlarm_IRQHandler                 ; RTC Alarm through EXINT Line
+        DCD     USBFSWakeUp_IRQHandler              ; USB Wakeup from suspend
+        DCD     TMR8_BRK_IRQHandler                 ; TMR8 Brake
+        DCD     TMR8_OVF_IRQHandler                 ; TMR8 Overflow
+        DCD     TMR8_TRG_HALL_IRQHandler            ; TMR8 Trigger and hall
+        DCD     TMR8_CH_IRQHandler                  ; TMR8 Channel
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     SDIO1_IRQHandler                    ; SDIO1
+        DCD     TMR5_GLOBAL_IRQHandler              ; TMR5
+        DCD     0                                   ; Reserved
+        DCD     UART4_IRQHandler                    ; UART4
+        DCD     UART5_IRQHandler                    ; UART5
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     DMA2_Channel1_IRQHandler            ; DMA2 Channel1
+        DCD     DMA2_Channel2_IRQHandler            ; DMA2 Channel2
+        DCD     DMA2_Channel3_IRQHandler            ; DMA2 Channel3
+        DCD     DMA2_Channel4_5_IRQHandler          ; DMA2 Channel4 & Channel5
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     0                                   ; Reserved
+        DCD     CAN2_TX_IRQHandler                  ; CAN2 TX
+        DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0
+        DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1
+        DCD     CAN2_SE_IRQHandler                  ; CAN2 SE
+        DCD     ACC_IRQHandler                      ; ACC
+        DCD     USBFS_MAPH_IRQHandler               ; USB Map HP
+        DCD     USBFS_MAPL_IRQHandler               ; USB Map LP
+        DCD     DMA2_Channel6_7_IRQHandler          ; DMA2 Channel6 & Channel7
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Default interrupt handlers.
+;;
+        THUMB
+
+        PUBWEAK Reset_Handler
+        SECTION .text:CODE:REORDER:NOROOT(2)
+Reset_Handler
+        LDR     R0, =SystemInit
+        BLX     R0
+        LDR     R0, =__iar_program_start
+        BX      R0
+
+        PUBWEAK NMI_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+NMI_Handler
+        B NMI_Handler
+
+        PUBWEAK HardFault_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+HardFault_Handler
+        B HardFault_Handler
+
+        PUBWEAK MemManage_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+MemManage_Handler
+        B MemManage_Handler
+
+        PUBWEAK BusFault_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+BusFault_Handler
+        B BusFault_Handler
+
+        PUBWEAK UsageFault_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+UsageFault_Handler
+        B UsageFault_Handler
+
+        PUBWEAK SVC_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+SVC_Handler
+        B SVC_Handler
+
+        PUBWEAK DebugMon_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DebugMon_Handler
+        B DebugMon_Handler
+
+        PUBWEAK PendSV_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+PendSV_Handler
+        B PendSV_Handler
+
+        PUBWEAK SysTick_Handler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+SysTick_Handler
+        B SysTick_Handler
+
+        PUBWEAK WWDT_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+WWDT_IRQHandler
+        B WWDT_IRQHandler
+
+        PUBWEAK PVM_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+PVM_IRQHandler
+        B PVM_IRQHandler
+
+        PUBWEAK TAMPER_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TAMPER_IRQHandler
+        B TAMPER_IRQHandler
+
+        PUBWEAK RTC_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+RTC_IRQHandler
+        B RTC_IRQHandler
+
+        PUBWEAK FLASH_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+FLASH_IRQHandler
+        B FLASH_IRQHandler
+
+        PUBWEAK CRM_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CRM_IRQHandler
+        B CRM_IRQHandler
+
+        PUBWEAK EXINT0_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT0_IRQHandler
+        B EXINT0_IRQHandler
+
+        PUBWEAK EXINT1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT1_IRQHandler
+        B EXINT1_IRQHandler
+
+        PUBWEAK EXINT2_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT2_IRQHandler
+        B EXINT2_IRQHandler
+
+        PUBWEAK EXINT3_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT3_IRQHandler
+        B EXINT3_IRQHandler
+
+        PUBWEAK EXINT4_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT4_IRQHandler
+        B EXINT4_IRQHandler
+
+        PUBWEAK DMA1_Channel1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel1_IRQHandler
+        B DMA1_Channel1_IRQHandler
+
+        PUBWEAK DMA1_Channel2_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel2_IRQHandler
+        B DMA1_Channel2_IRQHandler
+
+        PUBWEAK DMA1_Channel3_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel3_IRQHandler
+        B DMA1_Channel3_IRQHandler
+
+        PUBWEAK DMA1_Channel4_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel4_IRQHandler
+        B DMA1_Channel4_IRQHandler
+
+        PUBWEAK DMA1_Channel5_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel5_IRQHandler
+        B DMA1_Channel5_IRQHandler
+
+        PUBWEAK DMA1_Channel6_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel6_IRQHandler
+        B DMA1_Channel6_IRQHandler
+
+        PUBWEAK DMA1_Channel7_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_Channel7_IRQHandler
+        B DMA1_Channel7_IRQHandler
+
+        PUBWEAK ADC1_2_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+ADC1_2_IRQHandler
+        B ADC1_2_IRQHandler
+
+        PUBWEAK USBFS_H_CAN1_TX_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USBFS_H_CAN1_TX_IRQHandler
+        B USBFS_H_CAN1_TX_IRQHandler
+
+        PUBWEAK USBFS_L_CAN1_RX0_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USBFS_L_CAN1_RX0_IRQHandler
+        B USBFS_L_CAN1_RX0_IRQHandler
+
+        PUBWEAK CAN1_RX1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_RX1_IRQHandler
+        B CAN1_RX1_IRQHandler
+
+        PUBWEAK CAN1_SE_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_SE_IRQHandler
+        B CAN1_SE_IRQHandler
+
+        PUBWEAK EXINT9_5_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT9_5_IRQHandler
+        B EXINT9_5_IRQHandler
+
+        PUBWEAK TMR1_BRK_TMR9_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_BRK_TMR9_IRQHandler
+        B TMR1_BRK_TMR9_IRQHandler
+
+        PUBWEAK TMR1_OVF_TMR10_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_OVF_TMR10_IRQHandler
+        B TMR1_OVF_TMR10_IRQHandler
+
+        PUBWEAK TMR1_TRG_HALL_TMR11_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_TRG_HALL_TMR11_IRQHandler
+        B TMR1_TRG_HALL_TMR11_IRQHandler
+
+        PUBWEAK TMR1_CH_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_CH_IRQHandler
+        B TMR1_CH_IRQHandler
+
+        PUBWEAK TMR2_GLOBAL_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR2_GLOBAL_IRQHandler
+        B TMR2_GLOBAL_IRQHandler
+
+        PUBWEAK TMR3_GLOBAL_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR3_GLOBAL_IRQHandler
+        B TMR3_GLOBAL_IRQHandler
+
+        PUBWEAK TMR4_GLOBAL_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR4_GLOBAL_IRQHandler
+        B TMR4_GLOBAL_IRQHandler
+
+        PUBWEAK I2C1_EVT_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+I2C1_EVT_IRQHandler
+        B I2C1_EVT_IRQHandler
+
+        PUBWEAK I2C1_ERR_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+I2C1_ERR_IRQHandler
+        B I2C1_ERR_IRQHandler
+
+        PUBWEAK I2C2_EVT_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+I2C2_EVT_IRQHandler
+        B I2C2_EVT_IRQHandler
+
+        PUBWEAK I2C2_ERR_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+I2C2_ERR_IRQHandler
+        B I2C2_ERR_IRQHandler
+
+        PUBWEAK SPI1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+SPI1_IRQHandler
+        B SPI1_IRQHandler
+
+        PUBWEAK SPI2_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+SPI2_IRQHandler
+        B SPI2_IRQHandler
+
+        PUBWEAK USART1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USART1_IRQHandler
+        B USART1_IRQHandler
+
+        PUBWEAK USART2_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USART2_IRQHandler
+        B USART2_IRQHandler
+
+        PUBWEAK USART3_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USART3_IRQHandler
+        B USART3_IRQHandler
+
+        PUBWEAK EXINT15_10_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+EXINT15_10_IRQHandler
+        B EXINT15_10_IRQHandler
+
+        PUBWEAK RTCAlarm_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+RTCAlarm_IRQHandler
+        B RTCAlarm_IRQHandler
+
+        PUBWEAK USBFSWakeUp_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USBFSWakeUp_IRQHandler
+        B USBFSWakeUp_IRQHandler
+
+        PUBWEAK TMR8_BRK_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_BRK_IRQHandler
+        B TMR8_BRK_IRQHandler
+
+        PUBWEAK TMR8_OVF_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_OVF_IRQHandler
+        B TMR8_OVF_IRQHandler
+
+        PUBWEAK TMR8_TRG_HALL_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_TRG_HALL_IRQHandler
+        B TMR8_TRG_HALL_IRQHandler
+
+        PUBWEAK TMR8_CH_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_CH_IRQHandler
+        B TMR8_CH_IRQHandler
+
+        PUBWEAK SDIO1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+SDIO1_IRQHandler
+        B SDIO1_IRQHandler
+
+        PUBWEAK TMR5_GLOBAL_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+TMR5_GLOBAL_IRQHandler
+        B TMR5_GLOBAL_IRQHandler
+
+        PUBWEAK UART4_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+UART4_IRQHandler
+        B UART4_IRQHandler
+
+        PUBWEAK UART5_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+UART5_IRQHandler
+        B UART5_IRQHandler
+
+        PUBWEAK DMA2_Channel1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_Channel1_IRQHandler
+        B DMA2_Channel1_IRQHandler
+
+        PUBWEAK DMA2_Channel2_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_Channel2_IRQHandler
+        B DMA2_Channel2_IRQHandler
+
+        PUBWEAK DMA2_Channel3_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_Channel3_IRQHandler
+        B DMA2_Channel3_IRQHandler
+
+        PUBWEAK DMA2_Channel4_5_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_Channel4_5_IRQHandler
+        B DMA2_Channel4_5_IRQHandler
+
+        PUBWEAK CAN2_TX_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_TX_IRQHandler
+        B CAN2_TX_IRQHandler
+
+        PUBWEAK CAN2_RX0_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_RX0_IRQHandler
+        B CAN2_RX0_IRQHandler
+
+        PUBWEAK CAN2_RX1_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_RX1_IRQHandler
+        B CAN2_RX1_IRQHandler
+
+        PUBWEAK CAN2_SE_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_SE_IRQHandler
+        B CAN2_SE_IRQHandler
+
+        PUBWEAK ACC_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+ACC_IRQHandler
+        B ACC_IRQHandler
+
+        PUBWEAK USBFS_MAPH_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USBFS_MAPH_IRQHandler
+        B USBFS_MAPH_IRQHandler
+
+        PUBWEAK USBFS_MAPL_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+USBFS_MAPL_IRQHandler
+        B USBFS_MAPL_IRQHandler
+
+        PUBWEAK DMA2_Channel6_7_IRQHandler
+        SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_Channel6_7_IRQHandler
+        B DMA2_Channel6_7_IRQHandler
+
+        END

+ 357 - 0
libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f413.s

@@ -0,0 +1,357 @@
+;**************************************************************************
+;* @file     startup_at32f413.s
+;* @brief    at32f413 startup file for keil
+;* <<< Use Configuration Wizard in Context Menu >>>  
+;**************************************************************************
+;
+
+; Amount of memory (in bytes) allocated for Stack
+; Tailor this value to your application needs
+; <h> Stack Configuration
+;   <o> Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+; </h>
+
+Stack_Size      EQU     0x00000400
+
+                AREA    STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem       SPACE   Stack_Size
+__initial_sp
+
+; <h> Heap Configuration
+;   <o>  Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+; </h>
+
+Heap_Size       EQU     0x00000200
+
+                AREA    HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem        SPACE   Heap_Size
+__heap_limit
+
+                PRESERVE8
+                THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+                AREA    RESET, DATA, READONLY
+                EXPORT  __Vectors
+                EXPORT  __Vectors_End
+                EXPORT  __Vectors_Size
+
+__Vectors       DCD     __initial_sp                        ; Top of Stack
+                DCD     Reset_Handler                       ; Reset Handler
+                DCD     NMI_Handler                         ; NMI Handler
+                DCD     HardFault_Handler                   ; Hard Fault Handler
+                DCD     MemManage_Handler                   ; MPU Fault Handler
+                DCD     BusFault_Handler                    ; Bus Fault Handler
+                DCD     UsageFault_Handler                  ; Usage Fault Handler
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     SVC_Handler                         ; SVCall Handler
+                DCD     DebugMon_Handler                    ; Debug Monitor Handler
+                DCD     0                                   ; Reserved
+                DCD     PendSV_Handler                      ; PendSV Handler
+                DCD     SysTick_Handler                     ; SysTick Handler
+
+                ; External Interrupts
+                DCD     WWDT_IRQHandler                     ; Window Watchdog Timer
+                DCD     PVM_IRQHandler                      ; PVM through EXINT Line detect
+                DCD     TAMPER_IRQHandler                   ; Tamper
+                DCD     RTC_IRQHandler                      ; RTC
+                DCD     FLASH_IRQHandler                    ; Flash
+                DCD     CRM_IRQHandler                      ; CRM
+                DCD     EXINT0_IRQHandler                   ; EXINT Line 0
+                DCD     EXINT1_IRQHandler                   ; EXINT Line 1
+                DCD     EXINT2_IRQHandler                   ; EXINT Line 2
+                DCD     EXINT3_IRQHandler                   ; EXINT Line 3
+                DCD     EXINT4_IRQHandler                   ; EXINT Line 4
+                DCD     DMA1_Channel1_IRQHandler            ; DMA1 Channel 1
+                DCD     DMA1_Channel2_IRQHandler            ; DMA1 Channel 2
+                DCD     DMA1_Channel3_IRQHandler            ; DMA1 Channel 3
+                DCD     DMA1_Channel4_IRQHandler            ; DMA1 Channel 4
+                DCD     DMA1_Channel5_IRQHandler            ; DMA1 Channel 5
+                DCD     DMA1_Channel6_IRQHandler            ; DMA1 Channel 6
+                DCD     DMA1_Channel7_IRQHandler            ; DMA1 Channel 7
+                DCD     ADC1_2_IRQHandler                   ; ADC1 & ADC2
+                DCD     USBFS_H_CAN1_TX_IRQHandler          ; USB High Priority or CAN1 TX
+                DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0
+                DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1
+                DCD     CAN1_SE_IRQHandler                  ; CAN1 SE
+                DCD     EXINT9_5_IRQHandler                 ; EXINT Line [9:5]
+                DCD     TMR1_BRK_TMR9_IRQHandler            ; TMR1 Brake and TMR9
+                DCD     TMR1_OVF_TMR10_IRQHandler           ; TMR1 overflow and TMR10
+                DCD     TMR1_TRG_HALL_TMR11_IRQHandler      ; TMR1 Trigger and hall and TMR11
+                DCD     TMR1_CH_IRQHandler                  ; TMR1 channel
+                DCD     TMR2_GLOBAL_IRQHandler              ; TMR2
+                DCD     TMR3_GLOBAL_IRQHandler              ; TMR3
+                DCD     TMR4_GLOBAL_IRQHandler              ; TMR4
+                DCD     I2C1_EVT_IRQHandler                 ; I2C1 Event
+                DCD     I2C1_ERR_IRQHandler                 ; I2C1 Error
+                DCD     I2C2_EVT_IRQHandler                 ; I2C2 Event
+                DCD     I2C2_ERR_IRQHandler                 ; I2C2 Error
+                DCD     SPI1_IRQHandler                     ; SPI1
+                DCD     SPI2_IRQHandler                     ; SPI2
+                DCD     USART1_IRQHandler                   ; USART1
+                DCD     USART2_IRQHandler                   ; USART2
+                DCD     USART3_IRQHandler                   ; USART3
+                DCD     EXINT15_10_IRQHandler               ; EXINT Line [15:10]
+                DCD     RTCAlarm_IRQHandler                 ; RTC Alarm through EXINT Line
+                DCD     USBFSWakeUp_IRQHandler              ; USB Wakeup from suspend
+                DCD     TMR8_BRK_IRQHandler                 ; TMR8 Brake
+                DCD     TMR8_OVF_IRQHandler                 ; TMR8 overflow
+                DCD     TMR8_TRG_HALL_IRQHandler            ; TMR8 Trigger and hall
+                DCD     TMR8_CH_IRQHandler                  ; TMR8 channel
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     SDIO1_IRQHandler                    ; SDIO1
+                DCD     TMR5_GLOBAL_IRQHandler              ; TMR5
+                DCD     0                                   ; Reserved
+                DCD     UART4_IRQHandler                    ; UART4
+                DCD     UART5_IRQHandler                    ; UART5
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     DMA2_Channel1_IRQHandler            ; DMA2 Channel1
+                DCD     DMA2_Channel2_IRQHandler            ; DMA2 Channel2
+                DCD     DMA2_Channel3_IRQHandler            ; DMA2 Channel3
+                DCD     DMA2_Channel4_5_IRQHandler          ; DMA2 Channel4 & Channel5
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     0                                   ; Reserved
+                DCD     CAN2_TX_IRQHandler                  ; CAN2 TX
+                DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0
+                DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1
+                DCD     CAN2_SE_IRQHandler                  ; CAN2 SE
+                DCD     ACC_IRQHandler                      ; ACC
+                DCD     USBFS_MAPH_IRQHandler               ; USB Map High
+                DCD     USBFS_MAPL_IRQHandler               ; USB Map Low
+                DCD     DMA2_Channel6_7_IRQHandler          ; DMA2 Channel6 & Channel7
+__Vectors_End
+
+__Vectors_Size  EQU  __Vectors_End - __Vectors
+
+                AREA    |.text|, CODE, READONLY
+
+; Reset handler
+Reset_Handler   PROC
+                EXPORT  Reset_Handler                       [WEAK]
+                IMPORT  __main
+                IMPORT  SystemInit
+                LDR     R0, =SystemInit
+                BLX     R0
+                LDR     R0, =__main
+                BX      R0
+                ENDP
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler     PROC
+                EXPORT  NMI_Handler                         [WEAK]
+                B       .
+                ENDP
+HardFault_Handler\
+                PROC
+                EXPORT  HardFault_Handler                   [WEAK]
+                B       .
+                ENDP
+MemManage_Handler\
+                PROC
+                EXPORT  MemManage_Handler                   [WEAK]
+                B       .
+                ENDP
+BusFault_Handler\
+                PROC
+                EXPORT  BusFault_Handler                    [WEAK]
+                B       .
+                ENDP
+UsageFault_Handler\
+                PROC
+                EXPORT  UsageFault_Handler                  [WEAK]
+                B       .
+                ENDP
+SVC_Handler     PROC
+                EXPORT  SVC_Handler                         [WEAK]
+                B       .
+                ENDP
+DebugMon_Handler\
+                PROC
+                EXPORT  DebugMon_Handler                    [WEAK]
+                B       .
+                ENDP
+PendSV_Handler  PROC
+                EXPORT  PendSV_Handler                      [WEAK]
+                B       .
+                ENDP
+SysTick_Handler PROC
+                EXPORT  SysTick_Handler                     [WEAK]
+                B       .
+                ENDP
+
+Default_Handler PROC
+
+                EXPORT  WWDT_IRQHandler                     [WEAK]
+                EXPORT  PVM_IRQHandler                      [WEAK]
+                EXPORT  TAMPER_IRQHandler                   [WEAK]
+                EXPORT  RTC_IRQHandler                      [WEAK]
+                EXPORT  FLASH_IRQHandler                    [WEAK]
+                EXPORT  CRM_IRQHandler                      [WEAK]
+                EXPORT  EXINT0_IRQHandler                   [WEAK]
+                EXPORT  EXINT1_IRQHandler                   [WEAK]
+                EXPORT  EXINT2_IRQHandler                   [WEAK]
+                EXPORT  EXINT3_IRQHandler                   [WEAK]
+                EXPORT  EXINT4_IRQHandler                   [WEAK]
+                EXPORT  DMA1_Channel1_IRQHandler            [WEAK]
+                EXPORT  DMA1_Channel2_IRQHandler            [WEAK]
+                EXPORT  DMA1_Channel3_IRQHandler            [WEAK]
+                EXPORT  DMA1_Channel4_IRQHandler            [WEAK]
+                EXPORT  DMA1_Channel5_IRQHandler            [WEAK]
+                EXPORT  DMA1_Channel6_IRQHandler            [WEAK]
+                EXPORT  DMA1_Channel7_IRQHandler            [WEAK]
+                EXPORT  ADC1_2_IRQHandler                   [WEAK]
+                EXPORT  USBFS_H_CAN1_TX_IRQHandler          [WEAK]
+                EXPORT  USBFS_L_CAN1_RX0_IRQHandler         [WEAK]
+                EXPORT  CAN1_RX1_IRQHandler                 [WEAK]
+                EXPORT  CAN1_SE_IRQHandler                  [WEAK]
+                EXPORT  EXINT9_5_IRQHandler                 [WEAK]
+                EXPORT  TMR1_BRK_TMR9_IRQHandler            [WEAK]
+                EXPORT  TMR1_OVF_TMR10_IRQHandler           [WEAK]
+                EXPORT  TMR1_TRG_HALL_TMR11_IRQHandler      [WEAK]
+                EXPORT  TMR1_CH_IRQHandler                  [WEAK]
+                EXPORT  TMR2_GLOBAL_IRQHandler              [WEAK]
+                EXPORT  TMR3_GLOBAL_IRQHandler              [WEAK]
+                EXPORT  TMR4_GLOBAL_IRQHandler              [WEAK]
+                EXPORT  I2C1_EVT_IRQHandler                 [WEAK]
+                EXPORT  I2C1_ERR_IRQHandler                 [WEAK]
+                EXPORT  I2C2_EVT_IRQHandler                 [WEAK]
+                EXPORT  I2C2_ERR_IRQHandler                 [WEAK]
+                EXPORT  SPI1_IRQHandler                     [WEAK]
+                EXPORT  SPI2_IRQHandler                     [WEAK]
+                EXPORT  USART1_IRQHandler                   [WEAK]
+                EXPORT  USART2_IRQHandler                   [WEAK]
+                EXPORT  USART3_IRQHandler                   [WEAK]
+                EXPORT  EXINT15_10_IRQHandler               [WEAK]
+                EXPORT  RTCAlarm_IRQHandler                 [WEAK]
+                EXPORT  USBFSWakeUp_IRQHandler              [WEAK]
+                EXPORT  TMR8_BRK_IRQHandler                 [WEAK]
+                EXPORT  TMR8_OVF_IRQHandler                 [WEAK]
+                EXPORT  TMR8_TRG_HALL_IRQHandler            [WEAK]
+                EXPORT  TMR8_CH_IRQHandler                  [WEAK]
+                EXPORT  SDIO1_IRQHandler                    [WEAK]
+                EXPORT  TMR5_GLOBAL_IRQHandler              [WEAK]
+                EXPORT  UART4_IRQHandler                    [WEAK]
+                EXPORT  UART5_IRQHandler                    [WEAK]
+                EXPORT  DMA2_Channel1_IRQHandler            [WEAK]
+                EXPORT  DMA2_Channel2_IRQHandler            [WEAK]
+                EXPORT  DMA2_Channel3_IRQHandler            [WEAK]
+                EXPORT  DMA2_Channel4_5_IRQHandler          [WEAK]
+                EXPORT  CAN2_TX_IRQHandler                  [WEAK]
+                EXPORT  CAN2_RX0_IRQHandler                 [WEAK]
+                EXPORT  CAN2_RX1_IRQHandler                 [WEAK]
+                EXPORT  CAN2_SE_IRQHandler                  [WEAK]
+                EXPORT  ACC_IRQHandler                      [WEAK]
+                EXPORT  USBFS_MAPH_IRQHandler               [WEAK]
+                EXPORT  USBFS_MAPL_IRQHandler               [WEAK]
+                EXPORT  DMA2_Channel6_7_IRQHandler          [WEAK]
+
+WWDT_IRQHandler
+PVM_IRQHandler
+TAMPER_IRQHandler
+RTC_IRQHandler
+FLASH_IRQHandler
+CRM_IRQHandler
+EXINT0_IRQHandler
+EXINT1_IRQHandler
+EXINT2_IRQHandler
+EXINT3_IRQHandler
+EXINT4_IRQHandler
+DMA1_Channel1_IRQHandler
+DMA1_Channel2_IRQHandler
+DMA1_Channel3_IRQHandler
+DMA1_Channel4_IRQHandler
+DMA1_Channel5_IRQHandler
+DMA1_Channel6_IRQHandler
+DMA1_Channel7_IRQHandler
+ADC1_2_IRQHandler
+USBFS_H_CAN1_TX_IRQHandler
+USBFS_L_CAN1_RX0_IRQHandler
+CAN1_RX1_IRQHandler
+CAN1_SE_IRQHandler
+EXINT9_5_IRQHandler
+TMR1_BRK_TMR9_IRQHandler
+TMR1_OVF_TMR10_IRQHandler
+TMR1_TRG_HALL_TMR11_IRQHandler
+TMR1_CH_IRQHandler
+TMR2_GLOBAL_IRQHandler
+TMR3_GLOBAL_IRQHandler
+TMR4_GLOBAL_IRQHandler
+I2C1_EVT_IRQHandler
+I2C1_ERR_IRQHandler
+I2C2_EVT_IRQHandler
+I2C2_ERR_IRQHandler
+SPI1_IRQHandler
+SPI2_IRQHandler
+USART1_IRQHandler
+USART2_IRQHandler
+USART3_IRQHandler
+EXINT15_10_IRQHandler
+RTCAlarm_IRQHandler
+USBFSWakeUp_IRQHandler
+TMR8_BRK_IRQHandler
+TMR8_OVF_IRQHandler
+TMR8_TRG_HALL_IRQHandler
+TMR8_CH_IRQHandler
+SDIO1_IRQHandler
+TMR5_GLOBAL_IRQHandler
+UART4_IRQHandler
+UART5_IRQHandler
+DMA2_Channel1_IRQHandler
+DMA2_Channel2_IRQHandler
+DMA2_Channel3_IRQHandler
+DMA2_Channel4_5_IRQHandler
+CAN2_TX_IRQHandler
+CAN2_RX0_IRQHandler
+CAN2_RX1_IRQHandler
+CAN2_SE_IRQHandler
+ACC_IRQHandler
+USBFS_MAPH_IRQHandler
+USBFS_MAPL_IRQHandler
+DMA2_Channel6_7_IRQHandler
+                B       .
+
+                ENDP
+
+                ALIGN
+
+;*******************************************************************************
+; User Stack and Heap initialization
+;*******************************************************************************
+                 IF      :DEF:__MICROLIB
+
+                 EXPORT  __initial_sp
+                 EXPORT  __heap_base
+                 EXPORT  __heap_limit
+
+                 ELSE
+
+                 IMPORT  __use_two_region_memory
+                 EXPORT  __user_initial_stackheap
+
+__user_initial_stackheap
+
+                 LDR     R0, = Heap_Mem
+                 LDR     R1, = (Stack_Mem + Stack_Size)
+                 LDR     R2, = (Heap_Mem +  Heap_Size)
+                 LDR     R3, = Stack_Mem
+                 BX      LR
+
+                 ALIGN
+
+                 ENDIF
+
+                 END

+ 189 - 0
libraries/cmsis/cm4/device_support/system_at32f413.c

@@ -0,0 +1,189 @@
+/**
+  **************************************************************************
+  * @file     system_at32f413.c
+  * @brief    contains all the functions for cmsis cortex-m4 system source file
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+/** @addtogroup CMSIS
+  * @{
+  */
+
+/** @addtogroup AT32F413_system
+  * @{
+  */
+
+#include "at32f413.h"
+
+/** @addtogroup AT32F413_system_private_defines
+  * @{
+  */
+#define VECT_TAB_OFFSET                  0x0 /*!< vector table base offset field. this value must be a multiple of 0x200. */
+/**
+  * @}
+  */
+
+/** @addtogroup AT32F413_system_private_variables
+  * @{
+  */
+unsigned int system_core_clock           = HICK_VALUE; /*!< system clock frequency (core clock) */
+/**
+  * @}
+  */
+
+/** @addtogroup AT32F413_system_private_functions
+  * @{
+  */
+
+/**
+  * @brief  setup the microcontroller system
+  *         initialize the flash interface.
+  * @note   this function should be used only after reset.
+  * @param  none
+  * @retval none
+  */
+void SystemInit (void)
+{
+#if defined (__FPU_USED) && (__FPU_USED == 1U)
+  SCB->CPACR |= ((3U << 10U * 2U) |         /* set cp10 full access */
+                 (3U << 11U * 2U)  );       /* set cp11 full access */
+#endif
+
+  /* reset the crm clock configuration to the default reset state(for debug purpose) */
+  /* set hicken bit */
+  CRM->ctrl_bit.hicken = TRUE;
+
+  /* wait hick stable */
+  while(CRM->ctrl_bit.hickstbl != SET);
+
+  /* hick used as system clock */
+  CRM->cfg_bit.sclksel = CRM_SCLK_HICK;
+
+  /* wait sclk switch status */
+  while(CRM->cfg_bit.sclksts != CRM_SCLK_HICK);
+
+  /* reset hexten, hextbyps, cfden and pllen bits */
+  CRM->ctrl &= ~(0x010D0000U);
+
+  /* reset cfg register, include sclk switch, ahbdiv, apb1div, apb2div, adcdiv,
+     clkout pllrcs, pllhextdiv, pllmult, usbdiv and pllrange bits */
+  CRM->cfg = 0;
+
+  /* reset clkout[3], usbbufs, hickdiv, clkoutdiv */
+  CRM->misc1 = 0;
+
+  /* disable all interrupts enable and clear pending bits  */
+  CRM->clkint = 0x009F0000;
+
+#ifdef VECT_TAB_SRAM
+  SCB->VTOR = SRAM_BASE  | VECT_TAB_OFFSET;  /* vector table relocation in internal sram. */
+#else
+  SCB->VTOR = FLASH_BASE | VECT_TAB_OFFSET;  /* vector table relocation in internal flash. */
+#endif
+}
+
+/**
+  * @brief  update system_core_clock variable according to clock register values.
+  *         the system_core_clock variable contains the core clock (hclk), it can
+  *         be used by the user application to setup the systick timer or configure
+  *         other parameters.
+  * @param  none
+  * @retval none
+  */
+void system_core_clock_update(void)
+{
+  uint32_t pll_mult = 0, pll_mult_h = 0, pll_clock_source = 0, temp = 0, div_value = 0;
+  crm_sclk_type sclk_source;
+
+  static const uint8_t sys_ahb_div_table[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 6, 7, 8, 9};
+
+  /* get sclk source */
+  sclk_source = crm_sysclk_switch_status_get();
+
+  switch(sclk_source)
+  {
+    case CRM_SCLK_HICK:
+      if(((CRM->misc3_bit.hick_to_sclk) != RESET) && ((CRM->misc1_bit.hickdiv) != RESET))
+        system_core_clock = HICK_VALUE * 6;
+      else
+        system_core_clock = HICK_VALUE;
+      break;
+    case CRM_SCLK_HEXT:
+      system_core_clock = HEXT_VALUE;
+      break;
+    case CRM_SCLK_PLL:
+      pll_clock_source = CRM->cfg_bit.pllrcs;
+      {
+        /* get multiplication factor */
+        pll_mult = CRM->cfg_bit.pllmult_l;
+        pll_mult_h = CRM->cfg_bit.pllmult_h;
+        /* process high bits */
+        if((pll_mult_h != 0U) || (pll_mult == 15U)){
+            pll_mult += ((16U * pll_mult_h) + 1U);
+        }
+        else
+        {
+            pll_mult += 2U;
+        }
+
+        if (pll_clock_source == 0x00)
+        {
+          /* hick divided by 2 selected as pll clock entry */
+          system_core_clock = (HICK_VALUE >> 1) * pll_mult;
+        }
+        else
+        {
+          /* hext selected as pll clock entry */
+          if (CRM->cfg_bit.pllhextdiv != RESET)
+          {
+            /* hext clock divided by 2 */
+            system_core_clock = (HEXT_VALUE / 2) * pll_mult;
+          }
+          else
+          {
+            system_core_clock = HEXT_VALUE * pll_mult;
+          }
+        }
+      }
+      break;
+    default:
+      system_core_clock = HICK_VALUE;
+      break;
+  }
+
+  /* compute sclk, ahbclk frequency */
+  /* get ahb division */
+  temp = CRM->cfg_bit.ahbdiv;
+  div_value = sys_ahb_div_table[temp];
+  /* ahbclk frequency */
+  system_core_clock = system_core_clock >> div_value;
+}
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+

+ 89 - 0
libraries/cmsis/cm4/device_support/system_at32f413.h

@@ -0,0 +1,89 @@
+/**
+  **************************************************************************
+  * @file     system_at32f413.h
+  * @brief    cmsis cortex-m4 system header file.
+  **************************************************************************
+  *                       Copyright notice & Disclaimer
+  *
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
+  * software is governed by this copyright notice and the following disclaimer.
+  *
+  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
+  * GUARANTEES OR REPRESENTATIONS OF ANY KIND. ARTERY EXPRESSLY DISCLAIMS,
+  * TO THE FULLEST EXTENT PERMITTED BY LAW, ALL EXPRESS, IMPLIED OR
+  * STATUTORY OR OTHER WARRANTIES, GUARANTEES OR REPRESENTATIONS,
+  * INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+  *
+  **************************************************************************
+  */
+
+#ifndef __SYSTEM_AT32F413_H
+#define __SYSTEM_AT32F413_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup CMSIS
+  * @{
+  */
+
+/** @addtogroup AT32F413_system
+  * @{
+  */
+
+/** @defgroup AT32F413_system_clock_stable_definition
+  * @{
+  */
+
+#define HEXT_STABLE_DELAY                (5000u)
+#define PLL_STABLE_DELAY                 (500u)
+#define SystemCoreClock                  system_core_clock
+#define DUMMY_NOP()                      {__NOP();__NOP();__NOP();__NOP();__NOP(); \
+                                          __NOP();__NOP();__NOP();__NOP();__NOP(); \
+                                          __NOP();__NOP();__NOP();__NOP();__NOP(); \
+                                          __NOP();__NOP();__NOP();__NOP();__NOP();}
+
+/**
+  * @}
+  */
+
+/** @defgroup AT32F413_system_exported_variables
+  * @{
+  */
+
+extern unsigned int system_core_clock; /*!< system clock frequency (core clock) */
+
+/**
+  * @}
+  */
+
+/** @defgroup AT32F413_system_exported_functions
+  * @{
+  */
+
+extern void SystemInit(void);
+extern void system_core_clock_update(void);
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+/**
+  * @}
+  */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 414 - 0
libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h

@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2016, 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMATH_H__
+#define __ARM_COMPUTE_NEMATH_H__
+
+
+#if defined(ARM_MATH_NEON)
+/** Calculate floor of a vector.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated floor vector.
+ */
+static inline float32x4_t vfloorq_f32(float32x4_t val);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float32x2_t vinvsqrt_f32(float32x2_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float32x2_t vinv_f32(float32x2_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float32x4_t vinvq_f32(float32x4_t x);
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
+ *
+ * @return The calculated approximation.
+ */
+static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponent.
+ */
+static inline float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate logarithm
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+static inline float32x4_t vlogq_f32(float32x4_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+static inline float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+static inline float16x8_t vtanhq_f16(float16x8_t val);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float16x4_t vinv_f16(float16x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float16x8_t vinvq_f16(float16x8_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float16x4_t vinvsqrt_f16(float16x4_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float16x8_t vinvsqrtq_f16(float16x8_t x);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F16 format.
+ *
+ * @return The calculated exponent.
+ */
+static inline float16x8_t vexpq_f16(float16x8_t x);
+
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F16 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Exponent polynomial coefficients */
+extern const float32_t exp_tab[4*8];
+
+
+/** Logarithm polynomial coefficients */
+extern const float32_t log_tab[4*8];
+
+#ifndef DOXYGEN_SKIP_THIS
+inline float32x4_t vfloorq_f32(float32x4_t val)
+{
+    static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f};
+
+    const int32x4_t   z = vcvtq_s32_f32(val);
+    const float32x4_t r = vcvtq_f32_s32(z);
+
+    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, vld1q_f32(CONST_1)), r);
+}
+
+inline float32x2_t vinvsqrt_f32(float32x2_t x)
+{
+    float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x2_t vinv_f32(float32x2_t x)
+{
+    float32x2_t recip = vrecpe_f32(x);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    float32x4_t recip = vrecpeq_f32(x);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
+{
+    float32x4_t A   = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x);
+    float32x4_t B   = vmlaq_f32(vld1q_f32(&coeffs[4*2]), vld1q_f32(&coeffs[4*6]), x);
+    float32x4_t C   = vmlaq_f32(vld1q_f32(&coeffs[4*1]), vld1q_f32(&coeffs[4*5]), x);
+    float32x4_t D   = vmlaq_f32(vld1q_f32(&coeffs[4*3]), vld1q_f32(&coeffs[4*7]), x);
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+    return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32_t CONST_LN2[4]          = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
+    static const float32_t CONST_INV_LN2[4]      = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2)
+    static const float32_t CONST_0[4]            = {0.f,0.f,0.f,0.f};
+    static const int32_t   CONST_NEGATIVE_126[4] = {-126,-126,-126,-126};
+
+    // Perform range reduction [-log(2),log(2)]
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(CONST_INV_LN2)));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
+    poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly);
+
+    return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32_t   CONST_127[4] = {127,127,127,127};           // 127
+    static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
+
+    // Extract exponent
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), vld1q_s32(CONST_127));
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+    // Reconstruct
+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
+
+    return poly;
+}
+
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+    static const float32_t CONST_1[4]        = {1.f,1.f,1.f,1.f};
+    static const float32_t CONST_2[4]        = {2.f,2.f,2.f,2.f};
+    static const float32_t CONST_MIN_TANH[4] = {-10.f,-10.f,-10.f,-10.f};
+    static const float32_t CONST_MAX_TANH[4] = {10.f,10.f,10.f,10.f};
+
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, vld1q_f32(CONST_MIN_TANH)), vld1q_f32(CONST_MAX_TANH));
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(vld1q_f32(CONST_2), x));
+    float32x4_t num   = vsubq_f32(exp2x, vld1q_f32(CONST_1));
+    float32x4_t den   = vaddq_f32(exp2x, vld1q_f32(CONST_1));
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
+    return tanh;
+}
+
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Exponent polynomial coefficients */
+/** Logarithm polynomial coefficients */
+#ifndef DOXYGEN_SKIP_THIS
+inline float16x8_t vfloorq_f16(float16x8_t val)
+{
+    static const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
+
+    const int16x8_t   z = vcvtq_s16_f16(val);
+    const float16x8_t r = vcvtq_f16_s16(z);
+
+    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, vld1q_f16(CONST_1)), r);
+}
+inline float16x4_t vinvsqrt_f16(float16x4_t x)
+{
+    float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
+inline float16x8_t vinvsqrtq_f16(float16x8_t x)
+{
+    float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
+    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
+inline float16x4_t vinv_f16(float16x4_t x)
+{
+    float16x4_t recip = vrecpe_f16(x);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vinvq_f16(float16x8_t x)
+{
+    float16x8_t recip = vrecpeq_f16(x);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vtanhq_f16(float16x8_t val)
+{
+    const float16_t CONST_1[8]        = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
+    const float16_t CONST_2[8]        = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f};
+    const float16_t CONST_MIN_TANH[8] = {-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f};
+    const float16_t CONST_MAX_TANH[8] = {10.f,10.f,10.f,10.f,10.f,10.f,10.f,10.f};
+
+    const float16x8_t x     = vminq_f16(vmaxq_f16(val, vld1q_f16(CONST_MIN_TANH)), vld1q_f16(CONST_MAX_TANH));
+    const float16x8_t exp2x = vexpq_f16(vmulq_f16(vld1q_f16(CONST_2), x));
+    const float16x8_t num   = vsubq_f16(exp2x, vld1q_f16(CONST_1));
+    const float16x8_t den   = vaddq_f16(exp2x, vld1q_f16(CONST_1));
+    const float16x8_t tanh  = vmulq_f16(num, vinvq_f16(den));
+    return tanh;
+}
+
+inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const float16_t *coeffs)
+{
+    const float16x8_t A   = vaddq_f16(&coeffs[8*0], vmulq_f16(&coeffs[8*4], x));
+    const float16x8_t B   = vaddq_f16(&coeffs[8*2], vmulq_f16(&coeffs[8*6], x));
+    const float16x8_t C   = vaddq_f16(&coeffs[8*1], vmulq_f16(&coeffs[8*5], x));
+    const float16x8_t D   = vaddq_f16(&coeffs[8*3], vmulq_f16(&coeffs[8*7], x));
+    const float16x8_t x2  = vmulq_f16(x, x);
+    const float16x8_t x4  = vmulq_f16(x2, x2);
+    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
+    return res;
+}
+
+inline float16x8_t vexpq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
+
+    const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vexpq_f32(x_low)), vexpq_f32(x_high));
+    return res;
+}
+
+inline float16x8_t vlogq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
+
+    const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vlogq_f32(x_low)), vlogq_f32(x_high));
+    return res;
+}
+
+inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
+{
+    // TODO (giaiod01) - COMPMID-1535
+    float32x4_t n0_f32   = vcvt_f32_f16(vget_low_f16(n));
+    float32x4_t n1_f32   = vcvt_f32_f16(vget_high_f16(n));
+    float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val));
+    float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val));
+
+    float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32)));
+    float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32)));
+
+    return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32));
+}
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif
+#endif /* __ARM_COMPUTE_NEMATH_H__ */

+ 21 - 0
libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2019 ARM Software
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

+ 19 - 0
libraries/cmsis/dsp/ComputeLibrary/README.md

@@ -0,0 +1,19 @@
+README
+======
+
+This folder is containing two files imported, and slightly modified, from the ComputeLibrary:
+
+    NEMath.h and arm_cl_tables.c 
+
+In the original compute library, there are instead two other files:
+
+    NEMath.h and NEMath.inl
+
+NEMath.inl is included from NEMath.h whereas in this CMSIS DSP implementation, there is no NEMath.inl and its content is copied into NEMath.h
+
+The tables contained in NEMath.inl have been moved to arm_cl_tables.c and finally the files are in C for the CMSIS DSP library and in C++ in the original Compute Library.
+
+Otherwise, the features and implementations are the same : a few optimized Neon functions.
+
+The license covering those files is different : It is a MIT license.
+Other parts of the CMSIS-DSP are covered with an Apache-2.0 license.

+ 55 - 0
libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c

@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_math.h"
+#include "NEMath.h"
+
+#if defined(ARM_MATH_NEON)
+
+/** Exponent polynomial coefficients */
+const float32_t exp_tab[4*8] =
+{
+        1.f,1.f,1.f,1.f,
+        0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f,
+        0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f,
+        0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f,
+        1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f,
+        0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f,
+        0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f,
+        0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
+};
+
+/** Logarithm polynomial coefficients */
+const float32_t log_tab[4*8] =
+{
+        -2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
+        -2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f,
+        -5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f,
+        -0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f,
+        5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f,
+        0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f,
+        4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f,
+        0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
+};
+
+#endif

+ 200 - 0
libraries/cmsis/dsp/PrivateInclude/arm_sorting.h

@@ -0,0 +1,200 @@
+/******************************************************************************
+ * @file     arm_sorting.h
+ * @brief    Private header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_SORTING_H_
+#define _ARM_SORTING_H_
+
+#include "arm_math.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_bubble_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+   /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_heap_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_insertion_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t *pSrc,
+          float32_t* pDst,
+    uint32_t blockSize);
+
+  /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_quick_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_selection_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_bitonic_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+          uint32_t blockSize);
+
+#if defined(ARM_MATH_NEON)
+
+#define vtrn256_128q(a, b)                   \
+do {                                         \
+	float32x4_t vtrn128_temp = a.val[1]; \
+	a.val[1] = b.val[0];                 \
+	b.val[0] = vtrn128_temp ;            \
+} while (0)
+
+#define vtrn128_64q(a, b)           \
+do {                                \
+	float32x2_t ab, cd, ef, gh; \
+	ab = vget_low_f32(a);	    \
+	ef = vget_low_f32(b);	    \
+	cd = vget_high_f32(a);	    \
+	gh = vget_high_f32(b);      \
+	a = vcombine_f32(ab, ef);   \
+	b = vcombine_f32(cd, gh);   \
+} while (0)
+
+#define vtrn256_64q(a, b)                  \
+do {                                       \
+	float32x2_t a_0, a_1, a_2, a_3;    \
+	float32x2_t b_0, b_1, b_2, b_3;    \
+	a_0 = vget_low_f32(a.val[0]);      \
+	a_1 = vget_high_f32(a.val[0]);     \
+	a_2 = vget_low_f32(a.val[1]);      \
+	a_3 = vget_high_f32(a.val[1]);     \
+	b_0 = vget_low_f32(b.val[0]);      \
+	b_1 = vget_high_f32(b.val[0]);     \
+	b_2 = vget_low_f32(b.val[1]);      \
+	b_3 = vget_high_f32(b.val[1]);     \
+	a.val[0] = vcombine_f32(a_0, b_0); \
+	a.val[1] = vcombine_f32(a_2, b_2); \
+	b.val[0] = vcombine_f32(a_1, b_1); \
+	b.val[1] = vcombine_f32(a_3, b_3); \
+} while (0)
+
+#define vtrn128_32q(a, b)                               \
+do {                                                    \
+	float32x4x2_t vtrn32_tmp = vtrnq_f32((a), (b)); \
+	(a) = vtrn32_tmp.val[0];                        \
+	(b) = vtrn32_tmp.val[1];                        \
+} while (0)
+
+#define vtrn256_32q(a, b)               \
+do {                                    \
+	float32x4x2_t vtrn32_tmp_1 = vtrnq_f32((a.val[0]), (b.val[0])); \
+	float32x4x2_t vtrn32_tmp_2 = vtrnq_f32((a.val[1]), (b.val[1])); \
+	a.val[0] = vtrn32_tmp_1.val[0]; \
+	a.val[1] = vtrn32_tmp_2.val[0]; \
+	b.val[0] = vtrn32_tmp_1.val[1]; \
+	b.val[1] = vtrn32_tmp_2.val[1]; \
+} while (0)
+
+#define vminmaxq(a, b)                    \
+	do {                              \
+	float32x4_t minmax_tmp = (a);     \
+	(a) = vminq_f32((a), (b));        \
+	(b) = vmaxq_f32(minmax_tmp, (b)); \
+} while (0)
+
+#define vminmax256q(a, b)                         \
+	do {                                      \
+	float32x4x2_t minmax256_tmp = (a);        \
+	a.val[0] = vminq_f32(a.val[0], b.val[0]); \
+	a.val[1] = vminq_f32(a.val[1], b.val[1]); \
+	b.val[0] = vmaxq_f32(minmax256_tmp.val[0], b.val[0]); \
+	b.val[1] = vmaxq_f32(minmax256_tmp.val[1], b.val[1]); \
+} while (0)
+
+#define vrev128q_f32(a) \
+        vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
+
+#define vrev256q_f32(a)     \
+	do {                \
+        float32x4_t rev_tmp = vcombine_f32(vrev64_f32(vget_high_f32(a.val[0])), vrev64_f32(vget_low_f32(a.val[0]))); \
+	a.val[0] = vcombine_f32(vrev64_f32(vget_high_f32(a.val[1])), vrev64_f32(vget_low_f32(a.val[1])));  \
+	a.val[1] = rev_tmp; \
+} while (0)
+
+#define vldrev128q_f32(a, p) \
+	do {                 \
+	a = vld1q_f32(p);    \
+	a = vrev128q_f32(a); \
+} while (0)
+
+#endif /* ARM_MATH_NEON */
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* _ARM_SORTING_H */

+ 58 - 0
libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h

@@ -0,0 +1,58 @@
+/******************************************************************************
+ * @file     arm_vec_fft.h
+ * @brief    Private header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     07. January 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_VEC_FFT_H_
+#define _ARM_VEC_FFT_H_
+
+#include "arm_math.h"
+#include "arm_helium_utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MVE_CMPLX_ADD_A_ixB(A, B)           vcaddq_rot90(A,B)
+#define MVE_CMPLX_SUB_A_ixB(A,B)            vcaddq_rot270(A,B)
+#define MVE_CMPLX_MULT_FLT_AxB(A,B)         vcmlaq_rot90(vcmulq(A, B), A, B)
+#define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B)    vcmlaq_rot270(vcmulq(A, B), A, B)
+
+#define MVE_CMPLX_MULT_FX_AxB(A,B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B);
+#define MVE_CMPLX_MULT_FX_AxConjB(A,B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B);
+
+#define MVE_CMPLX_ADD_FX_A_ixB(A, B)        vhcaddq_rot90(A,B)
+#define MVE_CMPLX_SUB_FX_A_ixB(A,B)         vhcaddq_rot270(A,B)
+
+
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _ARM_VEC_FFT_H_ */

+ 1661 - 0
libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h

@@ -0,0 +1,1661 @@
+/******************************************************************************
+ * @file     arm_vec_filtering.h
+ * @brief    Private header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     30. October 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_VEC_FILTERING_H_
+#define _ARM_VEC_FILTERING_H_
+
+#include "arm_math.h"
+#include "arm_helium_utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, count)\
+{                                                                                     \
+    float32_t const *pSrcX, *pSrcY;                                                   \
+    f32x4_t   acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec;                         \
+    uint32_t    k;                                                                    \
+                                                                                      \
+    acc0Vec = vdupq_n_f32(0.0f);                                                      \
+    acc1Vec = vdupq_n_f32(0.0f);                                                      \
+    acc2Vec = vdupq_n_f32(0.0f);                                                      \
+    acc3Vec = vdupq_n_f32(0.0f);                                                      \
+    pSrcX = (float32_t const *) pX;                                                   \
+    pSrcY = (float32_t const *) pY;                                                   \
+    k = count >> 2;                                                                   \
+                                                                                      \
+    while (k > 0U)                                                                    \
+    {                                                                                 \
+        yVec = vld1q(pSrcY);                                                          \
+        pSrcY += 4;                                                                   \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                 \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                     \
+        xVec = vldrwq_f32(&pSrcX[2]);                                                 \
+        acc2Vec = vfmaq_f32(acc2Vec, xVec, yVec);                                     \
+        xVec = vldrwq_f32(&pSrcX[3]);                                                 \
+        acc3Vec = vfmaq_f32(acc3Vec, xVec, yVec);                                     \
+        xVec = vld1q(pSrcX);                                                          \
+        pSrcX += 4;                                                                   \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                     \
+        /*  Decrement the loop counter   */                                           \
+        k--;                                                                          \
+    }                                                                                 \
+    /* loop + tail predication expected here  */                                      \
+    k = count % 0x4U;                                                                 \
+    if (k > 0U)                                                                       \
+    {                                                                                 \
+        mve_pred16_t p0 = vctp32q(k);                                                 \
+        yVec = vld1q(pSrcY);                                                          \
+        pSrcY += 4;                                                                   \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                 \
+        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                               \
+        xVec = vldrwq_f32(&pSrcX[2]);                                                 \
+        acc2Vec = vfmaq_m_f32(acc2Vec, xVec, yVec, p0);                               \
+        xVec = vldrwq_f32(&pSrcX[3]);                                                 \
+        acc3Vec = vfmaq_m_f32(acc3Vec, xVec, yVec, p0);                               \
+        xVec = vld1q(pSrcX);                                                          \
+        pSrcX += 4;                                                                   \
+        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                               \
+    }                                                                                 \
+                                                                                      \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                               \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                               \
+    acc2 = vecAddAcrossF32Mve(acc2Vec);                                               \
+    acc3 = vecAddAcrossF32Mve(acc3Vec);                                               \
+}
+
+#define MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count) \
+{                                                    \
+    float32_t const *pSrcX, *pSrcY;                  \
+    f32x4_t   accVec, xVec, yVec;                    \
+    uint32_t    k;                                   \
+                                                     \
+    accVec = vdupq_n_f32(0.0f);                      \
+    pSrcX = (float32_t const *) pX;                  \
+    pSrcY = (float32_t const *) pY;                  \
+    k = count >> 2;                                  \
+                                                     \
+    while (k > 0U)                                   \
+    {                                                \
+        yVec = vld1q(pSrcY);                         \
+        pSrcY += 4;                                  \
+        xVec = vld1q(pSrcX);                         \
+        pSrcX += 4;                                  \
+        accVec = vfmaq_f32(accVec, xVec, yVec);      \
+        /*  Decrement the loop counter   */          \
+        k--;                                         \
+    }                                                \
+    /* Loop with tail predication expected here  */  \
+    k = count % 0x4U;                                \
+    if (k > 0U)                                      \
+    {                                                \
+        mve_pred16_t p0 = vctp32q(k);                \
+        yVec = vld1q(pSrcY);                         \
+        pSrcY += 4;                                  \
+        xVec = vld1q(pSrcX);                         \
+        pSrcX += 4;                                  \
+        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);\
+    }                                                \
+    acc = vecAddAcrossF32Mve(accVec);                \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    float32_t const *pSrcX, *pSrcY;                                     \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
+    uint32_t    k;                                                      \
+                                                                        \
+    acc0Vec = vdupq_n_f32(0.0f);                                        \
+    acc1Vec = vdupq_n_f32(0.0f);                                        \
+    pSrcX = (float32_t const *) pX;                                     \
+    pSrcY = (float32_t const *) pY;                                     \
+    k = (count-1) >> 2;                                                 \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        yVec = vld1q(pSrcY);                                            \
+        pSrcY += 4;                                                     \
+        xVec = vldrwq_f32(&pSrcX[1]);                                   \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
+        xVec = vld1q(pSrcX);                                            \
+        pSrcX += 4;                                                     \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires exact number of sample (count-1)  */               \
+    /* disable extra lanes in final MAC computation  */                 \
+    k = (count-1) % 0x4U;                                               \
+    mve_pred16_t p0 = vctp32q(k);                                       \
+    yVec = vld1q(pSrcY);                                                \
+    pSrcY += 4;                                                         \
+    xVec = vldrwq_f32(&pSrcX[1]);                                       \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                     \
+    /* acc0 requires 1 additional sample  (count) */                    \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    p0 = vctp32q(k+1);                                                  \
+    xVec = vld1q(pSrcX);                                                \
+    pSrcX += 4;                                                         \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                     \
+                                                                        \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)\
+{                                                                         \
+    float32_t const *pSrcX, *pSrcY;                                       \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                               \
+    uint32_t    k;                                                        \
+                                                                          \
+    acc0Vec = vdupq_n_f32(0.0f);                                          \
+    acc1Vec = vdupq_n_f32(0.0f);                                          \
+    pSrcX = (float32_t const *) pX;                                       \
+    pSrcY = (float32_t const *) pY;                                       \
+    k = count >> 2;                                                       \
+                                                                          \
+    while (k > 0U)                                                        \
+    {                                                                     \
+        yVec = vld1q(pSrcY);                                              \
+        pSrcY += 4;                                                       \
+        xVec = vldrwq_f32(&pSrcX[1]);                                     \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                         \
+        xVec = vld1q(pSrcX);                                              \
+        pSrcX += 4;                                                       \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                         \
+        /*  Decrement the loop counter   */                               \
+        k--;                                                              \
+    }                                                                     \
+    /* loop + tail predication expected here  */                          \
+    k = count % 0x4U;                                                     \
+    if (k > 0U)                                                           \
+    {                                                                     \
+        mve_pred16_t p0 = vctp32q(k);                                     \
+        yVec = vld1q(pSrcY);                                              \
+        pSrcY += 4;                                                       \
+        xVec = vldrwq_f32(&pSrcX[1]);                                     \
+        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                   \
+        xVec = vld1q(pSrcX);                                              \
+        pSrcX += 4;                                                       \
+        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                   \
+    }                                                                     \
+                                                                          \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                   \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                   \
+}
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    float32_t const *pSrcX, *pSrcY;                                     \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
+    uint32_t    k;                                                      \
+                                                                        \
+    acc0Vec = vdupq_n_f32(0.0f);                                        \
+    acc1Vec = vdupq_n_f32(0.0f);                                        \
+    pSrcX = (float32_t const *) pX;                                     \
+    pSrcY = (float32_t const *) pY;                                     \
+    k = count >> 2;                                                     \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);                                            \
+        pSrcX += 4;                                                     \
+        yVec = vldrwq_f32(&pSrcY[-1]);                                  \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
+        yVec = vld1q(pSrcY);                                            \
+        pSrcY += 4;                                                     \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x4U;                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires 1 additional sample  */                            \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    mve_pred16_t p0 = vctp32q(k+1);                                     \
+    xVec = vld1q(pSrcX);                                                \
+    pSrcX += 4;                                                         \
+    yVec = vldrwq_f32(&pSrcY[-1]);                                      \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec,p0);                      \
+    /* acc0 requires exact number of sample  */                         \
+    /* disable extra lanes in final MAC computation  */                 \
+    p0 = vctp32q(k);                                                    \
+    yVec = vld1q(pSrcY);                                                \
+    pSrcY += 4;                                                         \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec,p0);                      \
+                                                                        \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    float32_t const *pSrcX;                                                                         \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
+    pSrcX = (float32_t const *) pX;                                                                 \
+    k = (count - 1) >> 2;                                                                           \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = (count - 1) % 0x4U;                                                                         \
+    mve_pred16_t p0 = vctp32q(k);                                                        \
+    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                        \
+    xVec = vldrwq_f32(&pSrcX[1]);                                                                   \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                 \
+    xVec = vld1q(pSrcX);  pSrcX += 4;                                                   \
+    p0 = vctp32q(k+1);                                                                   \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                 \
+                                                                                                    \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)                          \
+{                                                                                                   \
+    float32_t const *pSrcX;                                                                         \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
+    pSrcX = (float32_t const *) pX;                                                                 \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                             \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
+        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                             \
+    }                                                                                               \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    float32_t   const *pSrcX;                                                                       \
+    const float32_t  *pY1 = pY + 1;                                                                       \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
+    pSrcX = (float32_t const *) pX;                                                                 \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
+        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);                                   \
+        pY1-=4;                                                                                     \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x4U;                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    mve_pred16_t p0 = vctp32q(k);                                                        \
+    xVec = vld1q(pSrcX);  pSrcX += 4;                                                   \
+    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                        \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                 \
+    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);                                       \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp32q(k+1);                                                                   \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                 \
+                                                                                                    \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+}
+
+#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count)                                                \
+{                                                                                                   \
+    float32_t const *pSrcX;                                                                         \
+    f32x4_t   accVec, xVec, yVec;                                                                 \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    accVec = vdupq_n_f32(0.0f);                                                                     \
+    pSrcX = (float32_t const *) pX;                                                                 \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
+        accVec = vfmaq_f32(accVec, xVec, yVec);                                                     \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
+        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);                                               \
+    }                                                                                               \
+    acc = vecAddAcrossF32Mve(accVec);                                                               \
+}
+
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+
+#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count)                                                \
+{                                                                                                   \
+    q31_t const *pSrcX;                                                                             \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
+    }                                                                                               \
+    acc = asrl(acc, 31);                                                                            \
+}
+
+
+
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q31_t const *pSrcX;                                                                             \
+    const q31_t       *pY1 = pY + 1;                                                                      \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);                                   \
+        pY1-=4;                                                                                     \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x4U;                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    mve_pred16_t p0 = vctp32q(k);                                                        \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
+    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                        \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
+    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);                                       \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp32q(k+1);                                                                   \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
+                                                                                                    \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+}
+
+
+
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    q31_t const *pSrcX;                                                 \
+    q31x4_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q31_t const *) pX;                                         \
+    k = (count-1) >> 2;                                                 \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        /* note */                                                      \
+        /* could can be more efficient using Vector Scatter Store: */   \
+        /* + pre-increment + WB */                                      \
+        /* To be revisited when intrinsic available */                  \
+        /* SDCOMP-52618 */                                              \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);        \
+        pY-=4;                                                          \
+        xVec = vldrwq_s32(&pSrcX[1]);                                   \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX);                                            \
+        pSrcX += 4;                                                     \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = (count - 1) % 0x4U;                                             \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires exact number of sample (count-1)  */               \
+    /* disable extra lanes in final MAC computation  */                 \
+    mve_pred16_t p0 = vctp32q(k);                                       \
+    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);            \
+    xVec = vldrwq_s32(&pSrcX[1]);                                       \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 requires 1 additional sample  (count) */                    \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    p0 = vctp32q(k+1);                                                  \
+    xVec = vld1q(pSrcX);                                                \
+    pSrcX += 4;                                                         \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+                                                                        \
+    acc0 = asrl(acc0, 31);                                              \
+    acc1 = asrl(acc1, 31);                                              \
+}
+
+
+
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                          \
+{                                                                                                   \
+    q31_t const *pSrcX;                                                                             \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+}
+
+
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)              \
+{                                                                                                   \
+    q31_t const *pSrcX;                                                                             \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        pY-=4;                                                                                      \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+    acc2 = asrl(acc2, 31);                                                                          \
+    acc3 = asrl(acc3, 31);                                                                          \
+}
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q31_t const *pSrcX, *pSrcY;                                                                     \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    pSrcY  = (q31_t const *) pY;                                                                    \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        yVec = vldrwq_s32(&pSrcY[-1]);                                                              \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x4U;                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    mve_pred16_t p0 = vctp32q(k+1);                                                      \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
+    yVec = vldrwq_s32(&pSrcY[-1]);                                                                  \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                                                       \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    p0 = vctp32q(k);                                                                     \
+    yVec = vld1q(pSrcY);  pSrcY += 4;                                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                                                       \
+                                                                                                    \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+}
+
+#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)                                                \
+{                                                                                                   \
+    q31_t const *pSrcX, *pSrcY;                                                                     \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    pSrcY = (q31_t const *) pY;                                                                     \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /*  tail predication expected here  */                                                          \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
+    }                                                                                               \
+    acc = asrl(acc, 31);                                                                            \
+}
+
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)              \
+{                                                                                                   \
+    q31_t const *pSrcX, *pSrcY;                                                                     \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    pSrcY  = (q31_t const *) pY;                                                                    \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* loop + tail predication expected here  */                                                    \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+                                                                                                    \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+    acc2 = asrl(acc2, 31);                                                                          \
+    acc3 = asrl(acc3, 31);                                                                          \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                          \
+{                                                                                                   \
+    q31_t const *pSrcX, *pSrcY;                                                                     \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    pSrcY  = (q31_t const *) pY;                                                                    \
+    k = count >> 2;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* loop + tail predication expected here  */                                                    \
+    k = count % 0x4U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp32q(k);                                                    \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+                                                                                                    \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q31_t const *pSrcX, *pSrcY;                                                                     \
+    q31x4_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q31_t const *) pX;                                                                     \
+    pSrcY  = (q31_t const *) pY;                                                                    \
+    k = (count-1) >> 2;                                                                             \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires exact number of sample (count-1)  */                                           \
+    /* disable extra lanes in final MAC computation  */                                             \
+    k = (count-1) % 0x4U;                                                                           \
+    mve_pred16_t p0 = vctp32q(k);                                                        \
+    yVec = vld1q(pSrcY);  pSrcY += 4;                                                   \
+    xVec = vldrwq_s32(&pSrcX[1]);                                                                   \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
+    /* acc0 requires 1 additional sample  (count) */                                                \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp32q(k+1);                                                                   \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
+                                                                                                    \
+    acc0 = asrl(acc0, 31);                                                                          \
+    acc1 = asrl(acc1, 31);                                                                          \
+}
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q15_t const *pSrcX, *pSrcY;                                                                     \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    pSrcY  = (q15_t const *) pY;                                                                    \
+    k = count >> 3;                                                                                 \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        yVec = vldrhq_s16(&pSrcY[-1]);                                                              \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x8U;                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    mve_pred16_t p0 = vctp16q(k+1);                                                      \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
+    yVec = vldrhq_s16(&pSrcY[-1]);                                                                  \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                                                       \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    p0 = vctp16q(k);                                                                     \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                                                       \
+                                                                                                    \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+}
+
+#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)                                                \
+{                                                                                                   \
+    q15_t const *pSrcX, *pSrcY;                                                                     \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    pSrcY = (q15_t const *) pY;                                                                     \
+    k = count >> 3;                                                                                 \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /*  tail predication expected here  */                                                          \
+    k = count % 0x8U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
+    }                                                                                               \
+    acc = asrl(acc, 15);                                                                            \
+    acc = __SSAT(acc, 16);                                                                          \
+}
+
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)              \
+{                                                                                                   \
+    q15_t const *pSrcX, *pSrcY;                                                                     \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    pSrcY  = (q15_t const *) pY;                                                                    \
+    k = count >> 3;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* loop + tail predication expected here  */                                                    \
+    k = count % 0x8U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+                                                                                                    \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc2 = asrl(acc2, 15);                                                                          \
+    acc3 = asrl(acc3, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+    acc2 = __SSAT(acc2, 16);                                                                        \
+    acc3 = __SSAT(acc3, 16);                                                                        \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                          \
+{                                                                                                   \
+    q15_t const *pSrcX, *pSrcY;                                                                     \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    pSrcY  = (q15_t const *) pY;                                                                    \
+    k = count >> 3;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* loop + tail predication expected here  */                                                    \
+    k = count % 0x8U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+                                                                                                    \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q15_t const *pSrcX, *pSrcY;                                                                     \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    pSrcY  = (q15_t const *) pY;                                                                    \
+    k = (count-1) >> 3;                                                                             \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires exact number of sample (count-1)  */                                           \
+    /* disable extra lanes in final MAC computation  */                                             \
+    k = (count-1) % 0x8U;                                                                           \
+    mve_pred16_t p0 = vctp16q(k);                                                        \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                                   \
+    xVec = vldrhq_s16(&pSrcX[1]);                                                                   \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
+    /* acc0 requires 1 additional sample  (count) */                                                \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp16q(k+1);                                                                   \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
+                                                                                                    \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q15_t const *pSrcX;                                                                             \
+    const q15_t       *pY1 = pY + 1;                                                                      \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    k = count >> 3;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        pY-=8;                                                                                      \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);                                   \
+        pY1-=8;                                                                                     \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x8U;                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    mve_pred16_t p0 = vctp16q(k);                                                        \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
+    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                        \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
+    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);                                       \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp16q(k+1);                                                                   \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
+                                                                                                    \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+}
+
+#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count)                                                \
+{                                                                                                   \
+    q15_t const *pSrcX;                                                                             \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    k = count >> 3;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        pY-=8;                                                                                      \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x8U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
+    }                                                                                               \
+    acc = asrl(acc, 15);                                                                            \
+    acc = __SSAT(acc, 16);                                                                          \
+}
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)              \
+{                                                                                                   \
+    q15_t const *pSrcX;                                                                             \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    k = count >> 3;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        pY-=8;                                                                                      \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x8U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc2 = asrl(acc2, 15);                                                                          \
+    acc3 = asrl(acc3, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+    acc2 = __SSAT(acc2, 16);                                                                        \
+    acc3 = __SSAT(acc3, 16);                                                                        \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                          \
+{                                                                                                   \
+    q15_t const *pSrcX;                                                                             \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    k = count >> 3;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        pY-=8;                                                                                      \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x8U;                                                                               \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
+    }                                                                                               \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    q15_t const *pSrcX;                                                                             \
+    q15x8_t   xVec, yVec;                                                                         \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q15_t const *) pX;                                                                     \
+    k = (count-1) >> 3;                                                                             \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
+        pY-=8;                                                                                      \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = (count - 1) % 0x8U;                                                                         \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires exact number of sample (count-1)  */                                           \
+    /* disable extra lanes in final MAC computation  */                                             \
+    mve_pred16_t p0 = vctp16q(k);                                                        \
+    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                        \
+    xVec = vldrhq_s16(&pSrcX[1]);                                                                   \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
+    /* acc0 requires 1 additional sample  (count) */                                                \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp16q(k+1);                                                                   \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
+                                                                                                    \
+    acc0 = asrl(acc0, 15);                                                                          \
+    acc1 = asrl(acc1, 15);                                                                          \
+    acc0 = __SSAT(acc0, 16);                                                                        \
+    acc1 = __SSAT(acc1, 16);                                                                        \
+}
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
+{                                                                                                   \
+    q7_t const *pSrcX, *pSrcY;                                                                      \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    pSrcY = (q7_t const *) pY;                                                                      \
+    k = count >> 4;                                                                                 \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        yVec = vldrbq_s8(&pSrcY[-1]);                                                               \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x10U;                                                                              \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    mve_pred16_t p0 = vctp8q(k+1);                                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
+    yVec = vldrbq_s8(&pSrcY[-1]);                                                                   \
+    acc1 = vmladavaq_p(acc1, xVec, yVec,p0);                                                        \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    p0 = vctp8q(k);                                                                      \
+    yVec = vld1q(pSrcY);  pSrcY += 16;                                                    \
+    acc0 = vmladavaq_p(acc0, xVec, yVec,p0);                                                        \
+                                                                                                    \
+    acc0 = (acc0 >> 7);                                                                             \
+    acc1 = (acc1 >> 7);                                                                             \
+    acc0 = __SSAT(acc0, 8);                                                                         \
+    acc1 = __SSAT(acc1, 8);                                                                         \
+}
+
+#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)                                                 \
+{                                                                                                   \
+    q7_t const *pSrcX, *pSrcY;                                                                      \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    pSrcY = (q7_t const *) pY;                                                                      \
+    k = count >> 4;                                                                                 \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        acc = vmladavaq(acc, xVec, yVec);                                                           \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /*  tail predication expected here  */                                                          \
+    k = count % 0x10U;                                                                              \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp8q(k);                                                     \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                     \
+    }                                                                                               \
+    acc =(acc >> 7);                                                                                \
+    acc = __SSAT(acc, 8);                                                                           \
+}
+
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)               \
+{                                                                                                   \
+    q7_t const *pSrcX, *pSrcY;                                                                      \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    pSrcY = (q7_t const *) pY;                                                                      \
+    k = count >> 4;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
+        acc2 = vmladavaq(acc2, xVec, yVec);                                                         \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
+        acc3 = vmladavaq(acc3, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* loop + tail predication expected here  */                                                    \
+    k = count % 0x10U;                                                                              \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp8q(k);                                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
+        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                   \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
+        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                               \
+                                                                                                    \
+    acc0 = (acc0 >> 7);                                                                             \
+    acc1 = (acc1 >> 7);                                                                             \
+    acc2 = (acc2 >> 7);                                                                             \
+    acc3 = (acc3 >> 7);                                                                             \
+    acc0 = __SSAT(acc0, 8);                                                                         \
+    acc1 = __SSAT(acc1, 8);                                                                         \
+    acc2 = __SSAT(acc2, 8);                                                                         \
+    acc3 = __SSAT(acc3, 8);                                                                         \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                           \
+{                                                                                                   \
+    q7_t const *pSrcX, *pSrcY;                                                                      \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    pSrcY = (q7_t const *) pY;                                                                      \
+    k = count >> 4;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* loop + tail predication expected here  */                                                    \
+    k = count % 0x10U;                                                                              \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp8q(k);                                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                               \
+                                                                                                    \
+    acc0 = (acc0 >> 7);                                                                             \
+    acc1 = (acc1 >> 7);                                                                             \
+    acc0 = __SSAT(acc0, 8);                                                                         \
+    acc1 = __SSAT(acc1, 8);                                                                         \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
+{                                                                                                   \
+    q7_t const *pSrcX, *pSrcY;                                                                      \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    pSrcY = (q7_t const *) pY;                                                                      \
+    k = (count-1) >> 4;                                                                             \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires exact number of sample (count-1)  */                                           \
+    /* disable extra lanes in final MAC computation  */                                             \
+    k = (count-1) % 0x10U;                                                                          \
+    mve_pred16_t p0 = vctp8q(k);                                                         \
+    yVec = vld1q(pSrcY);  pSrcY += 16;                                                    \
+    xVec = vldrbq_s8(&pSrcX[1]);                                                                    \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
+    /* acc0 requires 1 additional sample  (count) */                                                \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp8q(k+1);                                                                    \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
+                                                                                                    \
+    acc0 = (acc0 >> 7);                                                                             \
+    acc1 = (acc1 >> 7);                                                                             \
+    acc0 = __SSAT(acc0, 8);                                                                         \
+    acc1 = __SSAT(acc1, 8);                                                                         \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
+{                                                                                                   \
+    q7_t const *pSrcX;                                                                              \
+    const q7_t       *pY1 = pY + 1;                                                                       \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    k = count >> 4;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        pY-=16;                                                                                     \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                                            \
+        pY1-=16;                                                                                    \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = count % 0x10U;                                                                              \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc0 requires exact number of sample  */                                                     \
+    /* disable extra lanes in final MAC computation  */                                             \
+    mve_pred16_t p0 = vctp8q(k);                                                         \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
+    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                 \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
+    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                                                \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp8q(k+1);                                                                    \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
+                                                                                                    \
+    acc0 = (acc0 >> 7);                                                                             \
+    acc1 = (acc1 >> 7);                                                                             \
+    acc0 = __SSAT(acc0, 8);                                                                         \
+    acc1 = __SSAT(acc1, 8);                                                                         \
+}
+
+#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count)                                                 \
+{                                                                                                   \
+    q7_t const *pSrcX;                                                                              \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    k = count >> 4;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        pY-=16;                                                                                     \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc = vmladavaq(acc, xVec, yVec);                                                           \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x10U;                                                                              \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp8q(k);                                                     \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                     \
+    }                                                                                               \
+    acc = __SSAT(acc >> 7, 8);                                                                      \
+}
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)               \
+{                                                                                                   \
+    q7_t const *pSrcX;                                                                              \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    k = count >> 4;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        pY-=16;                                                                                     \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
+        acc2 = vmladavaq(acc2, xVec, yVec);                                                         \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
+        acc3 = vmladavaq(acc3, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x10U;                                                                              \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp8q(k);                                                     \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
+        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                   \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
+        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                               \
+    acc0 = __SSAT(acc0 >> 7, 8);                                                                    \
+    acc1 = __SSAT(acc1 >> 7, 8);                                                                    \
+    acc2 = __SSAT(acc2 >> 7, 8);                                                                    \
+    acc3 = __SSAT(acc3 >> 7, 8);                                                                    \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                           \
+{                                                                                                   \
+    q7_t const *pSrcX;                                                                              \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    k = count >> 4;                                                                                 \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        pY-=16;                                                                                     \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Loop with tail predication expected here  */                                                 \
+    k = count % 0x10U;                                                                              \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp8q(k);                                                     \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                               \
+    acc0 = __SSAT(acc0 >> 7, 8);                                                                    \
+    acc1 = __SSAT(acc1 >> 7, 8);                                                                    \
+}
+
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
+{                                                                                                   \
+    q7_t const *pSrcX;                                                                              \
+    q7x16_t   xVec, yVec;                                                                          \
+    uint32_t    k;                                                                                  \
+                                                                                                    \
+    pSrcX = (q7_t const *) pX;                                                                      \
+    k = (count-1) >> 4;                                                                             \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        /* note */                                                                                  \
+        /* could can be more efficient using Vector Scatter Store: */                               \
+        /* + pre-increment + WB */                                                                  \
+        /* To be revisited when intrinsic available */                                              \
+        /* SDCOMP-52618 */                                                                          \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
+        pY-=16;                                                                                     \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = (count - 1) % 0x10U;                                                                        \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires exact number of sample (count-1)  */                                           \
+    /* disable extra lanes in final MAC computation  */                                             \
+    mve_pred16_t p0 = vctp8q(k);                                                         \
+    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                 \
+    xVec = vldrbq_s8(&pSrcX[1]);                                                                    \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
+    /* acc0 requires 1 additional sample  (count) */                                                \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp8q(k+1);                                                                    \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
+                                                                                                    \
+    acc0 = (acc0 >> 7);                                                                             \
+    acc1 = (acc1 >> 7);                                                                             \
+    acc0 = __SSAT(acc0, 8);                                                                         \
+    acc1 = __SSAT(acc1, 8);                                                                         \
+}
+
+#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _ARM_VEC_FILTERING_H_ */

+ 75 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c

@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BasicMathFunctions.c
+ * Description:  Combination of all basic math function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_abs_f32.c"
+#include "arm_abs_q15.c"
+#include "arm_abs_q31.c"
+#include "arm_abs_q7.c"
+#include "arm_add_f32.c"
+#include "arm_add_q15.c"
+#include "arm_add_q31.c"
+#include "arm_add_q7.c"
+#include "arm_and_u16.c"
+#include "arm_and_u32.c"
+#include "arm_and_u8.c"
+#include "arm_dot_prod_f32.c"
+#include "arm_dot_prod_q15.c"
+#include "arm_dot_prod_q31.c"
+#include "arm_dot_prod_q7.c"
+#include "arm_mult_f32.c"
+#include "arm_mult_q15.c"
+#include "arm_mult_q31.c"
+#include "arm_mult_q7.c"
+#include "arm_negate_f32.c"
+#include "arm_negate_q15.c"
+#include "arm_negate_q31.c"
+#include "arm_negate_q7.c"
+#include "arm_not_u16.c"
+#include "arm_not_u32.c"
+#include "arm_not_u8.c"
+#include "arm_offset_f32.c"
+#include "arm_offset_q15.c"
+#include "arm_offset_q31.c"
+#include "arm_offset_q7.c"
+#include "arm_or_u16.c"
+#include "arm_or_u32.c"
+#include "arm_or_u8.c"
+#include "arm_scale_f32.c"
+#include "arm_scale_q15.c"
+#include "arm_scale_q31.c"
+#include "arm_scale_q7.c"
+#include "arm_shift_q15.c"
+#include "arm_shift_q31.c"
+#include "arm_shift_q7.c"
+#include "arm_sub_f32.c"
+#include "arm_sub_q15.c"
+#include "arm_sub_q31.c"
+#include "arm_sub_q7.c"
+#include "arm_xor_u16.c"
+#include "arm_xor_u32.c"
+#include "arm_xor_u8.c"

+ 19 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt

@@ -0,0 +1,19 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPBasicMath)
+
+include(configLib)
+include(configDsp)
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPBasicMath STATIC ${SRC})
+
+configLib(CMSISDSPBasicMath ${ROOT})
+configDsp(CMSISDSPBasicMath ${ROOT})
+
+### Includes
+target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/Include")
+
+
+

+ 196 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c

@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_f32.c
+ * Description:  Floating-point vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include <math.h>
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicAbs Vector Absolute Value
+
+  Computes the absolute value of a vector on an element-by-element basis.
+
+  <pre>
+      pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Floating-point vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+        /* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vabsq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+
+    if (blkCnt > 0U)
+    {
+      /* C = |A| */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrc);
+      vstrwq_p(pDst, vabsq(vec1), p0);
+    }
+
+}
+
+#else
+void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+    	/* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vabsq_f32(vec1);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute and store result in destination buffer. */
+    *pDst++ = fabsf(*pSrc++);
+
+    *pDst++ = fabsf(*pSrc++);
+
+    *pDst++ = fabsf(*pSrc++);
+
+    *pDst++ = fabsf(*pSrc++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute and store result in destination buffer. */
+    *pDst++ = fabsf(*pSrc++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BasicAbs group
+ */

+ 178 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c

@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_q15.c
+ * Description:  Q15 vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Q15 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q15(
+    const q15_t * pSrc,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrhq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_abs_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q15_t in;                                      /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicAbs group
+ */

+ 208 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c

@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_q31.c
+ * Description:  Q31 vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Q31 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q31(
+    const q31_t * pSrc,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counters */
+    q31x4_t vecSrc;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * Advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * Tail
+     */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrwq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_abs_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in;                                      /* Temporary variable */
+
+#if defined(ARM_MATH_NEON)
+    int32x4_t vec1;
+    int32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+        /* Calculate absolute and then store the results in the destination buffer. */
+
+        vec1 = vld1q_s32(pSrc);
+        res = vqabsq_s32(vec1);
+        vst1q_s32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the blockSize loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined (ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined (ARM_MATH_MVEI) */
+/**
+  @} end of BasicAbs group
+ */

+ 180 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c

@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_q7.c
+ * Description:  Q7 vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Q7 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q7(
+    const q7_t * pSrc,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecSrc;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrbq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_abs_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q7_t in;                                       /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t) __QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicAbs group
+ */

+ 199 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c

@@ -0,0 +1,199 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_f32.c
+ * Description:  Floating-point vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicAdd Vector Addition
+
+  Element-by-element addition of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Floating-point vector addition.
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+        /* Add and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vaddq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A + B */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrcA);
+      vec2 = vld1q(pSrcB);
+      vstrwq_p(pDst, vaddq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+    	/* Add and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vaddq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicAdd group
+ */

+ 176 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c

@@ -0,0 +1,176 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q15.c
+ * Description:  Q15 vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Q15 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecA;
+    q15x8_t vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrhq_p(pDst, vqaddq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_add_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t inB1, inB2;
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 times 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 times 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
+
+    /* Add and store 2 times 2 samples at a time */
+    write_q15x2_ia (&pDst, __QADD16(inA1, inB1));
+    write_q15x2_ia (&pDst, __QADD16(inA2, inB2));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+  @} end of BasicAdd group
+ */

+ 159 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c

@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q31.c
+ * Description:  Q31 vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Q31 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;
+    q31x4_t vecA;
+    q31x4_t vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrwq_p(pDst, vqaddq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add and store result in destination buffer. */
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add and store result in destination buffer. */
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+  @} end of BasicAdd group
+ */

+ 158 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c

@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q7.c
+ * Description:  Q7 vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Q7 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecA;
+    q7x16_t vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrbq_p(pDst, vqaddq(vecA, vecB), p0);
+    }
+}
+#else
+void arm_add_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+#if defined (ARM_MATH_DSP)
+    /* Add and store result in destination buffer (4 samples at a time). */
+    write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
+#else
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+  @} end of BasicAdd group
+ */

+ 137 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c

@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_u16.c
+ * Description:  uint16_t bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup And Vector bitwise AND
+
+  Compute the logical bitwise AND.
+
+  There are separate functions for uint32_t, uint16_t, and uint7_t data types.
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise AND of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_and_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q15x8_t vecSrcA, vecSrcB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecA, vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u16(pSrcA);
+        vecB = vld1q_u16(pSrcB);
+
+        vst1q_u16(pDst, vandq_u16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of And group
+ */

+ 129 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c

@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_u32.c
+ * Description:  uint32_t bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise AND of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_and_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q31x4_t vecSrcA, vecSrcB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecA, vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u32(pSrcA);
+        vecB = vld1q_u32(pSrcB);
+
+        vst1q_u32(pDst, vandq_u32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of And group
+ */

+ 130 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c

@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_u8.c
+ * Description:  uint8_t bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise AND of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_and_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q7x16_t vecSrcA, vecSrcB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecA, vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u8(pSrcA);
+        vecB = vld1q_u8(pSrcB);
+
+        vst1q_u8(pDst, vandq_u8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of And group
+ */

+ 226 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c

@@ -0,0 +1,226 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_f32.c
+ * Description:  Floating-point dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicDotProd Vector Dot Product
+
+  Computes the dot product of two vectors.
+  The vectors are multiplied element-by-element and then summed.
+
+  <pre>
+      sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of floating-point vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_dot_prod_f32(
+    const float32_t * pSrcA,
+    const float32_t * pSrcB,
+    uint32_t    blockSize,
+    float32_t * result)
+{
+    f32x4_t vecA, vecB;
+    f32x4_t vecSum;
+    uint32_t blkCnt;
+    float32_t sum = 0.0f;
+    vecSum = vdupq_n_f32(0.0f);
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         * and advance vector source and destination pointers
+         */
+        vecA = vld1q(pSrcA);
+        pSrcA += 4;
+
+        vecB = vld1q(pSrcB);
+        pSrcB += 4;
+
+        vecSum = vfmaq(vecSum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt --;
+    }
+
+
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
+    }
+
+    sum = vecAddAcrossF32Mve(vecSum);
+
+    /* Store result in destination buffer */
+    *result = sum;
+
+}
+
+#else
+
+void arm_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t blockSize,
+        float32_t * result)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t sum = 0.0f;                          /* Temporary return variable */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t accum = vdupq_n_f32(0);
+    f32x2_t tmp = vdup_n_f32(0);
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    vec1 = vld1q_f32(pSrcA);
+    vec2 = vld1q_f32(pSrcB);
+
+    while (blkCnt > 0U)
+    {
+        /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
+        /* Calculate dot product and then store the result in a temporary buffer. */
+
+	      accum = vmlaq_f32(accum, vec1, vec2);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+#if __aarch64__
+    sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
+#else
+    tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
+    sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+#endif
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in destination buffer */
+  *result = sum;
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BasicDotProd group
+ */

+ 172 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c

@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_q15.c
+ * Description:  Q15 dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of Q15 vectors.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[in]     blockSize  number of samples in each vector
+  @param[out]    result     output result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
+                   results are added to a 64-bit accumulator in 34.30 format.
+                   Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
+                   there is no risk of overflow.
+                   The return result is in 34.30 format.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    uint32_t blockSize,
+    q63_t * result)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecA;
+    q15x8_t vecB;
+    q63_t     sum = 0LL;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmlaldavaq(sum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA += 8;
+        pSrcB += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmlaldavaq_p(sum, vecA, vecB, p0);
+    }
+
+    *result = sum;
+}
+
+#else
+void arm_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t sum = 0;                                 /* Temporary return variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+#if defined (ARM_MATH_DSP)
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
+    sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
+#else
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Calculate dot product and store result in a temporary buffer. */
+//#if defined (ARM_MATH_DSP)
+//    sum  = __SMLALD(*pSrcA++, *pSrcB++, sum);
+//#else
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+//#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in destination buffer in 34.30 format */
+  *result = sum;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicDotProd group
+ */

+ 174 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c

@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_q31.c
+ * Description:  Q31 dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of Q31 vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
+                   are truncated to 2.48 format by discarding the lower 14 bits.
+                   The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+                   There are 15 guard bits in the accumulator and there is no risk of overflow as long as
+                   the length of the vectors is less than 2^16 elements.
+                   The return result is in 16.48 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    uint32_t blockSize,
+    q63_t * result)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecA;
+    q31x4_t vecB;
+    q63_t     sum = 0LL;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vrmlaldavhaq(sum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA += 4;
+        pSrcB += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
+    }
+
+    /*
+     * vrmlaldavhaq provides extra intermediate accumulator headroom.
+     * limiting the need of intermediate scaling
+     * Scalar variant uses 2.48 accu format by right shifting accumulators by 14.
+     * 16.48 output conversion is performed outside the loop by scaling accu. by 6
+     */
+    *result = asrl(sum, (14 - 8));
+}
+
+#else
+void arm_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t sum = 0;                                 /* Temporary return variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in destination buffer in 16.48 format */
+  *result = sum;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicDotProd group
+ */

+ 191 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c

@@ -0,0 +1,191 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_q7.c
+ * Description:  Q7 dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of Q7 vectors.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[in]     blockSize  number of samples in each vector
+  @param[out]    result     output result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
+                   results are added to an accumulator in 18.14 format.
+                   Nonsaturating additions are used and there is no danger of wrap around as long as
+                   the vectors are less than 2^18 elements long.
+                   The return result is in 18.14 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    uint32_t blockSize,
+    q31_t * result)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecA;
+    q7x16_t vecB;
+    q31_t     sum = 0;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmladavaq(sum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA += 16;
+        pSrcB += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmladavaq_p(sum, vecA, vecB, p0);
+    }
+
+    *result = sum;
+}
+#else
+void arm_dot_prod_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        uint32_t blockSize,
+        q31_t * result)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t sum = 0;                                 /* Temporary return variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t input1, input2;                          /* Temporary variables */
+  q31_t inA1, inA2, inB1, inB2;                  /* Temporary variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+#if defined (ARM_MATH_DSP)
+    /* read 4 samples at a time from sourceA */
+    input1 = read_q7x4_ia ((q7_t **) &pSrcA);
+    /* read 4 samples at a time from sourceB */
+    input2 = read_q7x4_ia ((q7_t **) &pSrcB);
+
+    /* extract two q7_t samples to q15_t samples */
+    inA1 = __SXTB16(__ROR(input1, 8));
+    /* extract reminaing two samples */
+    inA2 = __SXTB16(input1);
+    /* extract two q7_t samples to q15_t samples */
+    inB1 = __SXTB16(__ROR(input2, 8));
+    /* extract reminaing two samples */
+    inB2 = __SXTB16(input2);
+
+    /* multiply and accumulate two samples at a time */
+    sum = __SMLAD(inA1, inB1, sum);
+    sum = __SMLAD(inA2, inB2, sum);
+#else
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Calculate dot product and store result in a temporary buffer. */
+//#if defined (ARM_MATH_DSP)
+//    sum  = __SMLAD(*pSrcA++, *pSrcB++, sum);
+//#else
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+//#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in destination buffer in 18.14 format */
+  *result = sum;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicDotProd group
+ */

+ 200 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c

@@ -0,0 +1,200 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_f32.c
+ * Description:  Floating-point vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicMult Vector Multiplication
+
+  Element-by-element multiplication of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Floating-point vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+      /* Add and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vmulq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+    if (blkCnt > 0U)
+    {
+      /* C = A + B */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrcA);
+      vec2 = vld1q(pSrcB);
+      vstrwq_p(pDst, vmulq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * B */
+
+    	/* Multiply the inputs and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vmulq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply inputs and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply input and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicMult group
+ */

+ 192 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c

@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_q15.c
+ * Description:  Q15 vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Q15 vector multiplication
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecA, vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * Multiply the inputs and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_mult_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2, inB1, inB2;                  /* Temporary input variables */
+  q15_t out1, out2, out3, out4;                  /* Temporary output variables */
+  q31_t mul1, mul2, mul3, mul4;                  /* Temporary variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    /* read 2 samples at a time from sourceA */
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 samples at a time from sourceB */
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
+
+    /* multiply mul = sourceA * sourceB */
+    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
+    mul2 = (q31_t) ((q15_t) (inA1      ) * (q15_t) (inB1      ));
+    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
+    mul4 = (q31_t) ((q15_t) (inA2      ) * (q15_t) (inB2      ));
+
+    /* saturate result to 16 bit */
+    out1 = (q15_t) __SSAT(mul1 >> 15, 16);
+    out2 = (q15_t) __SSAT(mul2 >> 15, 16);
+    out3 = (q15_t) __SSAT(mul3 >> 15, 16);
+    out4 = (q15_t) __SSAT(mul4 >> 15, 16);
+
+    /* store result to destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+    write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
+#else
+    write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
+    write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply inputs and store result in destination buffer. */
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicMult group
+ */

+ 168 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c

@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_q31.c
+ * Description:  Q31 vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Q31 vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecA, vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * Multiply the inputs and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_mult_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t out;                                     /* Temporary output variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply inputs and store result in destination buffer. */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply inputs and store result in destination buffer. */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicMult group
+ */

+ 168 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c

@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_q7.c
+ * Description:  Q7 vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Q7 vector multiplication
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecA, vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * Multiply the inputs and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_mult_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q7_t out1, out2, out3, out4;                   /* Temporary output variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+#if defined (ARM_MATH_DSP)
+    /* Multiply inputs and store results in temporary variables */
+    out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+
+    /* Pack and store result in destination buffer (in single write) */
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply input and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicMult group
+ */

+ 192 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c

@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_f32.c
+ * Description:  Negates floating-point vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicNegate Vector Negate
+
+  Negates the elements of a vector.
+
+  <pre>
+      pDst[n] = -pSrc[n],   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a floating-point vector.
+  @param[in]     pSrc       points to input vector.
+  @param[out]    pDst       points to output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+        /* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vnegq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+    if (blkCnt > 0U)
+    {
+      /* C = |A| */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q((float32_t const *) pSrc);
+      vstrwq_p(pDst, vnegq(vec1), p0);
+    }
+
+}
+
+#else
+void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = -A */
+
+    	/* Negate and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vnegq_f32(vec1);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicNegate group
+ */

+ 171 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c

@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_q15.c
+ * Description:  Negates Q15 vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a Q15 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q15(
+    const q15_t  * pSrc,
+    q15_t  * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * Negate and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrhq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_negate_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q15_t in;                                      /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t in1;                                    /* Temporary input variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+#if defined (ARM_MATH_DSP)
+    /* Negate and store result in destination buffer (2 samples at a time). */
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    write_q15x2_ia (&pDst, __QSUB16(0, in1));
+
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    write_q15x2_ia (&pDst, __QSUB16(0, in1));
+#else
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicNegate group
+ */

+ 178 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c

@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_q31.c
+ * Description:  Negates Q31 vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a Q31 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize   number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q31(
+    const q31_t * pSrc,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecSrc;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * Negate and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrwq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_negate_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in;                                      /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicNegate group
+ */

+ 171 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c

@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_q7.c
+ * Description:  Negates Q7 vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a Q7 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize   number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q7(
+    const q7_t   * pSrc,
+    q7_t   * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecSrc;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * Negate and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrbq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_negate_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q7_t in;                                       /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t in1;                                    /* Temporary input variable */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+#if defined (ARM_MATH_DSP)
+    /* Negate and store result in destination buffer (4 samples at a time). */
+    in1 = read_q7x4_ia ((q7_t **) &pSrc);
+    write_q7x4_ia (&pDst, __QSUB8(0, in1));
+#else
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q7_t) __QSUB8(0, in);
+#else
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicNegate group
+ */

+ 130 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c

@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_u16.c
+ * Description:  uint16_t bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup Not Vector bitwise NOT
+
+  Compute the logical bitwise NOT.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise NOT of a fixed-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_not_u16(
+    const uint16_t * pSrc,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q15x8_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+
+        vst1q(pDst, vmvnq_u16(vecSrc) );
+
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t inV;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_u16(pSrc);
+
+        vst1q_u16(pDst, vmvnq_u16(inV) );
+
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Not group
+ */

+ 122 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c

@@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_u32.c
+ * Description:  uint32_t bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise NOT of a fixed-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_not_u32(
+    const uint32_t * pSrc,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q31x4_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+
+        vst1q(pDst, vmvnq_u32(vecSrc) );
+
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t inV;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_u32(pSrc);
+
+        vst1q_u32(pDst, vmvnq_u32(inV) );
+
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Not group
+ */

+ 122 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c

@@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_u8.c
+ * Description:  uint8_t bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise NOT of a fixed-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_not_u8(
+    const uint8_t * pSrc,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q7x16_t vecSrc;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+
+        vst1q(pDst, vmvnq_u8(vecSrc) );
+
+        pSrc += 16;
+        pDst += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t inV;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_u8(pSrc);
+
+        vst1q_u8(pDst, vmvnq_u8(inV) );
+
+        pSrc += 16;
+        pDst += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Not group
+ */

+ 196 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c

@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_f32.c
+ * Description:  Floating-point vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicOffset Vector Offset
+
+  Adds a constant offset to each element of a vector.
+
+  <pre>
+      pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = A + offset */
+
+        /* Add offset and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vaddq(vec1,offset);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vec1 = vld1q((float32_t const *) pSrc);
+        vstrwq_p(pDst, vaddq(vec1, offset), p0);
+    }
+
+
+}
+
+#else
+void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + offset */
+
+        /* Add offset and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vaddq_f32(vec1,vdupq_n_f32(offset));
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer. */
+    *pDst++ = (*pSrc++) + offset;
+
+    *pDst++ = (*pSrc++) + offset;
+
+    *pDst++ = (*pSrc++) + offset;
+
+    *pDst++ = (*pSrc++) + offset;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer. */
+    *pDst++ = (*pSrc++) + offset;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicOffset group
+ */

+ 168 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c

@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_q15.c
+ * Description:  Q15 vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a Q15 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q15(
+    const q15_t * pSrc,
+    q15_t   offset,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Add offset and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrhq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+
+#else
+void arm_offset_q15(
+  const q15_t * pSrc,
+        q15_t offset,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t offset_packed;                           /* Offset packed to 32 bit */
+
+  /* Offset is packed to 32 bit in order to use SIMD32 for addition */
+  offset_packed = __PKHBT(offset, offset, 16);
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+#if defined (ARM_MATH_DSP)
+    /* Add offset and store result in destination buffer (2 samples at a time). */
+    write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
+    write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicOffset group
+ */

+ 175 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c

@@ -0,0 +1,175 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_q31.c
+ * Description:  Q31 vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a Q31 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q31(
+    const q31_t * pSrc,
+    q31_t   offset,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecSrc;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Add offset and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrwq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+#else
+void arm_offset_q31(
+  const q31_t * pSrc,
+        q31_t offset,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicOffset group
+ */

+ 162 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c

@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_q7.c
+ * Description:  Q7 vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a Q7 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q7(
+    const q7_t * pSrc,
+    q7_t   offset,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecSrc;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Add offset and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrbq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+#else
+void arm_offset_q7(
+  const q7_t * pSrc,
+        q7_t offset,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t offset_packed;                           /* Offset packed to 32 bit */
+
+  /* Offset is packed to 32 bit in order to use SIMD32 for addition */
+  offset_packed = __PACKq7(offset, offset, offset, offset);
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+#if defined (ARM_MATH_DSP)
+    /* Add offset and store result in destination buffer (4 samples at a time). */
+    write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offset_packed));
+#else
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicOffset group
+ */

+ 137 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c

@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_u16.c
+ * Description:  uint16_t bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup Or Vector bitwise inclusive OR
+
+  Compute the logical bitwise OR.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise OR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_or_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q15x8_t vecSrcA, vecSrcB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vorrq_u16(vecSrcA, vecSrcB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrhq_p(pDst, vorrq_u16(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecA, vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u16(pSrcA);
+        vecB = vld1q_u16(pSrcB);
+
+        vst1q_u16(pDst, vorrq_u16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Or group
+ */

+ 128 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c

@@ -0,0 +1,128 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_u32.c
+ * Description:  uint32_t bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise OR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_or_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q31x4_t vecSrcA, vecSrcB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vorrq_u32(vecSrcA, vecSrcB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrwq_p(pDst, vorrq_u32(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecA, vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u32(pSrcA);
+        vecB = vld1q_u32(pSrcB);
+
+        vst1q_u32(pDst, vorrq_u32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+/**
+  @} end of Or group
+ */

+ 128 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c

@@ -0,0 +1,128 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_u8.c
+ * Description:  uint8_t bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise OR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_or_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q7x16_t vecSrcA, vecSrcB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vorrq_u8(vecSrcA, vecSrcB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrbq_p(pDst, vorrq_u8(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecA, vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u8(pSrcA);
+        vecB = vld1q_u8(pSrcB);
+
+        vst1q_u8(pDst, vorrq_u8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+/**
+  @} end of Or group
+ */

+ 216 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c

@@ -0,0 +1,216 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_f32.c
+ * Description:  Multiplies a floating-point vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicScale Vector Scale
+
+  Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
+  </pre>
+
+  In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
+  a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
+  The shift allows the gain of the scaling operation to exceed 1.0.
+  The algorithm used with fixed-point data is:
+
+  <pre>
+      pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
+  </pre>
+
+  The overall scale factor applied to the fixed-point data is
+  <pre>
+      scale = scaleFract * 2^shift.
+  </pre>
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a floating-point vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scale      scale factor to be applied
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_f32(
+  const float32_t * pSrc,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + offset */
+
+        /* Add offset and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vmulq(vec1,scale);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vec1 = vld1q((float32_t const *) pSrc);
+        vstrwq_p(pDst, vmulq(vec1, scale), p0);
+    }
+
+
+}
+
+#else
+void arm_scale_f32(
+  const float32_t *pSrc,
+        float32_t scale,
+        float32_t *pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * scale */
+
+    	/* Scale the input and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vmulq_f32(vec1, vdupq_n_f32(scale));
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    float32_t in1, in2, in3, in4;
+
+    /* C = A * scale */
+
+    /* Scale input and store result in destination buffer. */
+    in1 = (*pSrc++) * scale;
+
+    in2 = (*pSrc++) * scale;
+
+    in3 = (*pSrc++) * scale;
+
+    in4 = (*pSrc++) * scale;
+
+    *pDst++ = in1;
+    *pDst++ = in2;
+    *pDst++ = in3;
+    *pDst++ = in4;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (*pSrc++) * scale;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicScale group
+ */

+ 201 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c

@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_q15.c
+ * Description:  Multiplies a Q15 vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a Q15 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
+                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_q15(
+    const q15_t * pSrc,
+    q15_t   scaleFract,
+    int8_t  shift,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+    q15x8_t vecDst;
+
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Scale the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);;
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+
+}
+
+
+#else
+void arm_scale_q15(
+  const q15_t *pSrc,
+        q15_t scaleFract,
+        int8_t shift,
+        q15_t *pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        int8_t kShift = 15 - shift;                    /* Shift to apply after scaling */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t out1, out2, out3, out4;                  /* Temporary output variables */
+  q15_t in1, in2, in3, in4;                      /* Temporary input variables */
+#endif
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 times 2 samples at a time from source */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrc);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrc);
+
+    /* Scale inputs and store result in temporary variables
+     * in single cycle by packing the outputs */
+    out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
+    out2 = (q31_t) ((q15_t) (inA1      ) * scaleFract);
+    out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
+    out4 = (q31_t) ((q15_t) (inA2      ) * scaleFract);
+
+    /* apply shifting */
+    out1 = out1 >> kShift;
+    out2 = out2 >> kShift;
+    out3 = out3 >> kShift;
+    out4 = out4 >> kShift;
+
+    /* saturate the output */
+    in1 = (q15_t) (__SSAT(out1, 16));
+    in2 = (q15_t) (__SSAT(out2, 16));
+    in3 = (q15_t) (__SSAT(out3, 16));
+    in4 = (q15_t) (__SSAT(out4, 16));
+
+    /* store result to destination */
+    write_q15x2_ia (&pDst, __PKHBT(in2, in1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(in4, in3, 16));
+#else
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicScale group
+ */

+ 244 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c

@@ -0,0 +1,244 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_q31.c
+ * Description:  Multiplies a Q31 vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a Q31 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
+                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_q31(
+    const q31_t * pSrc,
+    q31_t   scaleFract,
+    int8_t  shift,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecSrc;
+    q31x4_t vecDst;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Scale the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrwq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+void arm_scale_q31(
+  const q31_t *pSrc,
+        q31_t scaleFract,
+        int8_t shift,
+        q31_t *pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in, out;                                 /* Temporary variables */
+        int8_t kShift = shift + 1;                     /* Shift to apply after scaling */
+        int8_t sign = (kShift & 0x80);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale */
+
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;                                /* read input from source */
+      in = ((q63_t) in * scaleFract) >> 32;        /* multiply input with scaler value */
+      out = in << kShift;                          /* apply shifting */
+      if (in != (out >> kShift))                   /* saturate the result */
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;                               /* Store result destination */
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale */
+
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;                                /* read four inputs from source */
+      in = ((q63_t) in * scaleFract) >> 32;        /* multiply input with scaler value */
+      out = in >> -kShift;                         /* apply shifting */
+      *pDst++ = out;                               /* Store result destination */
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale */
+
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+          out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale */
+
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicScale group
+ */

+ 186 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c

@@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_q7.c
+ * Description:  Multiplies a Q7 vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a Q7 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
+                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+
+void arm_scale_q7(
+    const q7_t * pSrc,
+    q7_t   scaleFract,
+    int8_t  shift,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecSrc;
+    q7x16_t vecDst;
+
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Scale the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrbq_p(pDst, vecDst, p0);
+    }
+
+}
+
+#else
+void arm_scale_q7(
+  const q7_t * pSrc,
+        q7_t scaleFract,
+        int8_t shift,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        int8_t kShift = 7 - shift;                     /* Shift to apply after scaling */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q7_t in1,  in2,  in3,  in4;                    /* Temporary input variables */
+  q7_t out1, out2, out3, out4;                   /* Temporary output variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+#if defined (ARM_MATH_DSP)
+    /* Reading 4 inputs from memory */
+    in1 = *pSrc++;
+    in2 = *pSrc++;
+    in3 = *pSrc++;
+    in4 = *pSrc++;
+
+    /* Scale inputs and store result in the temporary variable. */
+    out1 = (q7_t) (__SSAT(((in1) * scaleFract) >> kShift, 8));
+    out2 = (q7_t) (__SSAT(((in2) * scaleFract) >> kShift, 8));
+    out3 = (q7_t) (__SSAT(((in3) * scaleFract) >> kShift, 8));
+    out4 = (q7_t) (__SSAT(((in4) * scaleFract) >> kShift, 8));
+
+    /* Pack and store result in destination buffer (in single write) */
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicScale group
+ */

+ 251 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c

@@ -0,0 +1,251 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_shift_q15.c
+ * Description:  Shifts the elements of a Q15 vector by a specified number of bits
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicShift
+  @{
+ */
+
+/**
+  @brief         Shifts the elements of a Q15 vector a specified number of bits
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q15(
+    const q15_t * pSrc,
+    int8_t shiftBits,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+    q15x8_t vecDst;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+void arm_shift_q15(
+  const q15_t * pSrc,
+        int8_t shiftBits,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q15_t in1, in2;                                /* Temporary input variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+      /* Shift the inputs and then store the results in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
+                                     __SSAT((in2 << shiftBits), 16), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
+                                      __SSAT((in1 << shiftBits), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
+                                     __SSAT((in2 << shiftBits), 16), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
+                                     __SSAT((in1 << shiftBits), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+      /* Shift the inputs and then store the results in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
+                                     (in1 >> -shiftBits), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
+                                     (in1 >> -shiftBits), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicShift group
+ */

+ 232 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c

@@ -0,0 +1,232 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_shift_q31.c
+ * Description:  Shifts the elements of a Q31 vector by a specified number of bits
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+/**
+  @defgroup BasicShift Vector Shift
+
+  Shifts the elements of a fixed-point vector by a specified number of bits.
+  There are separate functions for Q7, Q15, and Q31 data types.
+  The underlying algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.
+  </pre>
+
+  If <code>shift</code> is positive then the elements of the vector are shifted to the left.
+  If <code>shift</code> is negative then the elements of the vector are shifted to the right.
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
+ */
+
+/**
+  @addtogroup BasicShift
+  @{
+ */
+
+/**
+  @brief         Shifts the elements of a Q31 vector a specified number of bits.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in the vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q31(
+    const q31_t * pSrc,
+    int8_t shiftBits,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecSrc;
+    q31x4_t vecDst;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q((q31_t const *) pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q((q31_t const *) pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrwq_p(pDst, vecDst, p0);
+    }
+}
+
+
+#else
+void arm_shift_q31(
+  const q31_t * pSrc,
+        int8_t shiftBits,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  q31_t in, out;                                 /* Temporary variables */
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */
+
+      /* Shift input and store results in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicShift group
+ */

+ 225 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c

@@ -0,0 +1,225 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_shift_q7.c
+ * Description:  Processing function for the Q7 Shifting
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicShift
+  @{
+ */
+
+/**
+  @brief         Shifts the elements of a Q7 vector a specified number of bits
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           onditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q7(
+    const q7_t * pSrc,
+    int8_t shiftBits,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecSrc;
+    q7x16_t vecDst;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrbq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+void arm_shift_q7(
+  const q7_t * pSrc,
+        int8_t shiftBits,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q7_t in1,  in2,  in3,  in4;                    /* Temporary input variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* Read 4 inputs */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+      in3 = *pSrc++;
+      in4 = *pSrc++;
+
+    /* Pack and store result in destination buffer (in single write) */
+      write_q7x4_ia (&pDst, __PACKq7(__SSAT((in1 << shiftBits), 8),
+                                     __SSAT((in2 << shiftBits), 8),
+                                     __SSAT((in3 << shiftBits), 8),
+                                     __SSAT((in4 << shiftBits), 8) ));
+#else
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* Read 4 inputs */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+      in3 = *pSrc++;
+      in4 = *pSrc++;
+
+    /* Pack and store result in destination buffer (in single write) */
+      write_q7x4_ia (&pDst, __PACKq7((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits),
+                                     (in3 >> -shiftBits),
+                                     (in4 >> -shiftBits) ));
+#else
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */
+
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicShift group
+ */

+ 202 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c

@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_f32.c
+ * Description:  Floating-point vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicSub Vector Subtraction
+
+  Element-by-element subtraction of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Floating-point vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+      /* Add and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vsubq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A + B */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrcA);
+      vec2 = vld1q(pSrcB);
+      vstrwq_p(pDst, vsubq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A - B */
+
+        /* Subtract and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vsubq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicSub group
+ */

+ 178 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c

@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_q15.c
+ * Description:  Q15 vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Q15 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecA;
+    q15x8_t vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Subtract and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrhq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+
+
+#else
+void arm_sub_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t inB1, inB2;
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 times 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 times 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
+
+    /* Subtract and store 2 times 2 samples at a time */
+    write_q15x2_ia (&pDst, __QSUB16(inA1, inB1));
+    write_q15x2_ia (&pDst, __QSUB16(inA2, inB2));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicSub group
+ */

+ 159 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c

@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_q31.c
+ * Description:  Q31 vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Q31 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;
+    q31x4_t vecA;
+    q31x4_t vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrwq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicSub group
+ */

+ 158 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c

@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_q7.c
+ * Description:  Q7 vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Q7 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecA;
+    q7x16_t vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Subtract and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vstrbq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+#else
+void arm_sub_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+#if defined (ARM_MATH_DSP)
+    /* Subtract and store result in destination buffer (4 samples at a time). */
+    write_q7x4_ia (&pDst, __QSUB8(read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
+#else
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicSub group
+ */

+ 137 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c

@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_u16.c
+ * Description:  uint16_t bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup Xor Vector bitwise exclusive OR
+
+  Compute the logical bitwise XOR.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_xor_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q15x8_t vecSrcA, vecSrcB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, veorq_u16(vecSrcA, vecSrcB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrhq_p(pDst, veorq_u16(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecA, vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u16(pSrcA);
+        vecB = vld1q_u16(pSrcB);
+
+        vst1q_u16(pDst, veorq_u16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 7;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Xor group
+ */

+ 129 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c

@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_u32.c
+ * Description:  uint32_t bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_xor_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q31x4_t vecSrcA, vecSrcB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, veorq_u32(vecSrcA, vecSrcB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrwq_p(pDst, veorq_u32(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecA, vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u32(pSrcA);
+        vecB = vld1q_u32(pSrcB);
+
+        vst1q_u32(pDst, veorq_u32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 3;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Xor group
+ */

+ 129 - 0
libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c

@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_u8.c
+ * Description:  uint8_t bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_xor_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q7x16_t vecSrcA, vecSrcB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, veorq_u8(vecSrcA, vecSrcB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrbq_p(pDst, veorq_u8(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecA, vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u8(pSrcA);
+        vecB = vld1q_u8(pSrcB);
+
+        vst1q_u8(pDst, veorq_u8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0xF;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Xor group
+ */

+ 29 - 0
libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c

@@ -0,0 +1,29 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BayesFunctions.c
+ * Description:  Combination of all bayes function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_gaussian_naive_bayes_predict_f32.c"

+ 19 - 0
libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt

@@ -0,0 +1,19 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPBayes)
+
+include(configLib)
+include(configDsp)
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPBayes STATIC ${SRC})
+
+configLib(CMSISDSPBayes ${ROOT})
+configDsp(CMSISDSPBayes ${ROOT})
+
+### Includes
+target_include_directories(CMSISDSPBayes PUBLIC "${DSP}/Include")
+
+
+

+ 397 - 0
libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c

@@ -0,0 +1,397 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_naive_gaussian_bayes_predict_f32
+ * Description:  Naive Gaussian Bayesian Estimator
+ *
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include <limits.h>
+#include <math.h>
+
+#define PI_F 3.1415926535897932384626433832795f
+#define DPI_F (2.0f*3.1415926535897932384626433832795f)
+
+/**
+ * @addtogroup groupBayes
+ * @{
+ */
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  *S         points to a naive bayes instance structure
+ * @param[in]  *in        points to the elements of the input vector.
+ * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ * @par If the number of classes is big, MVE version will consume lot of
+ * stack since the log prior are computed on the stack.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer)
+{
+    uint32_t         nbClass;
+    const float32_t *pTheta = S->theta;
+    const float32_t *pSigma = S->sigma;
+    float32_t      *buffer = pBuffer;
+    const float32_t *pIn = in;
+    float32_t       result;
+    f32x4_t         vsigma;
+    float32_t       tmp;
+    f32x4_t         vacc1, vacc2;
+    uint32_t        index;
+    float32_t       logclassPriors[S->numberOfClasses];
+    float32_t      *pLogPrior = logclassPriors;
+
+    arm_vlog_f32((float32_t *) S->classPriors, logclassPriors, S->numberOfClasses);
+
+    pTheta = S->theta;
+    pSigma = S->sigma;
+
+    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
+        pIn = in;
+
+        vacc1 = vdupq_n_f32(0);
+        vacc2 = vdupq_n_f32(0);
+
+        uint32_t         blkCnt =S->vectorDimension >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vinvSigma, vtmp;
+
+            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
+            vacc1 = vaddq(vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)));
+
+            vinvSigma = vrecip_medprec_f32(vsigma);
+
+            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
+            /* squaring */
+            vtmp = vmulq(vtmp, vtmp);
+
+            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);
+
+            pIn += 4;
+            pTheta += 4;
+            pSigma += 4;
+            blkCnt--;
+        }
+
+        blkCnt = S->vectorDimension & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vinvSigma, vtmp;
+
+            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
+            vacc1 =
+                vaddq_m_f32(vacc1, vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)), p0);
+
+            vinvSigma = vrecip_medprec_f32(vsigma);
+
+            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
+            /* squaring */
+            vtmp = vmulq(vtmp, vtmp);
+
+            vacc2 = vfmaq_m_f32(vacc2, vtmp, vinvSigma, p0);
+
+            pTheta += blkCnt;
+            pSigma += blkCnt;
+        }
+
+        tmp = -0.5f * vecAddAcrossF32Mve(vacc1);
+        tmp -= 0.5f * vecAddAcrossF32Mve(vacc2);
+
+        *buffer = tmp + *pLogPrior++;
+        buffer++;
+    }
+
+    arm_max_f32(pBuffer, S->numberOfClasses, &result, &index);
+
+    return (index);
+}
+
+#else
+
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+
+
+
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer)
+{
+
+    const float32_t *pPrior = S->classPriors;
+
+    const float32_t *pTheta = S->theta;
+    const float32_t *pSigma = S->sigma;
+
+    const float32_t *pTheta1 = S->theta + S->vectorDimension;
+    const float32_t *pSigma1 = S->sigma + S->vectorDimension;
+
+    float32_t *buffer = pBuffer;
+    const float32_t *pIn=in;
+
+    float32_t result;
+    float32_t sigma,sigma1;
+    float32_t tmp,tmp1;
+    uint32_t index;
+    uint32_t vecBlkCnt;
+    uint32_t classBlkCnt;
+    float32x4_t epsilonV;
+    float32x4_t sigmaV,sigmaV1;
+    float32x4_t tmpV,tmpVb,tmpV1;
+    float32x2_t tmpV2;
+    float32x4_t thetaV,thetaV1;
+    float32x4_t inV;
+
+    epsilonV = vdupq_n_f32(S->epsilon);
+
+    classBlkCnt = S->numberOfClasses >> 1;
+    while(classBlkCnt > 0)
+    {
+
+
+        pIn = in;
+
+        tmp = logf(*pPrior++);
+        tmp1 = logf(*pPrior++);
+        tmpV = vdupq_n_f32(0.0f);
+        tmpV1 = vdupq_n_f32(0.0f);
+
+        vecBlkCnt = S->vectorDimension >> 2;
+        while(vecBlkCnt > 0)
+        {
+           sigmaV = vld1q_f32(pSigma);
+           thetaV = vld1q_f32(pTheta);
+
+           sigmaV1 = vld1q_f32(pSigma1);
+           thetaV1 = vld1q_f32(pTheta1);
+
+           inV = vld1q_f32(pIn);
+
+           sigmaV = vaddq_f32(sigmaV, epsilonV);
+           sigmaV1 = vaddq_f32(sigmaV1, epsilonV);
+
+           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
+           tmpVb = vlogq_f32(tmpVb);
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           tmpVb = vmulq_n_f32(sigmaV1,DPI_F);
+           tmpVb = vlogq_f32(tmpVb);
+           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
+
+           tmpVb = vsubq_f32(inV,thetaV);
+           tmpVb = vmulq_f32(tmpVb,tmpVb);
+           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           tmpVb = vsubq_f32(inV,thetaV1);
+           tmpVb = vmulq_f32(tmpVb,tmpVb);
+           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV1));
+           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
+
+           pIn += 4;
+           pTheta += 4;
+           pSigma += 4;
+           pTheta1 += 4;
+           pSigma1 += 4;
+
+           vecBlkCnt--;
+        }
+        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
+        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
+
+        tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
+        tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
+
+        vecBlkCnt = S->vectorDimension & 3;
+        while(vecBlkCnt > 0)
+        {
+           sigma = *pSigma + S->epsilon;
+           sigma1 = *pSigma1 + S->epsilon;
+
+           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
+           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
+
+           tmp1 -= 0.5f*logf(2.0f * PI_F * sigma1);
+           tmp1 -= 0.5f*(*pIn - *pTheta1) * (*pIn - *pTheta1) / sigma1;
+
+           pIn++;
+           pTheta++;
+           pSigma++;
+           pTheta1++;
+           pSigma1++;
+           vecBlkCnt--;
+        }
+
+        *buffer++ = tmp;
+        *buffer++ = tmp1;
+
+        pSigma += S->vectorDimension;
+        pTheta += S->vectorDimension;
+        pSigma1 += S->vectorDimension;
+        pTheta1 += S->vectorDimension;
+
+        classBlkCnt--;
+    }
+
+    classBlkCnt = S->numberOfClasses & 1;
+
+    while(classBlkCnt > 0)
+    {
+
+
+        pIn = in;
+
+        tmp = logf(*pPrior++);
+        tmpV = vdupq_n_f32(0.0f);
+
+        vecBlkCnt = S->vectorDimension >> 2;
+        while(vecBlkCnt > 0)
+        {
+           sigmaV = vld1q_f32(pSigma);
+           thetaV = vld1q_f32(pTheta);
+           inV = vld1q_f32(pIn);
+
+           sigmaV = vaddq_f32(sigmaV, epsilonV);
+
+           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
+           tmpVb = vlogq_f32(tmpVb);
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           tmpVb = vsubq_f32(inV,thetaV);
+           tmpVb = vmulq_f32(tmpVb,tmpVb);
+           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           pIn += 4;
+           pTheta += 4;
+           pSigma += 4;
+
+           vecBlkCnt--;
+        }
+        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
+        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
+
+        vecBlkCnt = S->vectorDimension & 3;
+        while(vecBlkCnt > 0)
+        {
+           sigma = *pSigma + S->epsilon;
+           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
+           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
+
+           pIn++;
+           pTheta++;
+           pSigma++;
+           vecBlkCnt--;
+        }
+
+        *buffer++ = tmp;
+
+        classBlkCnt--;
+    }
+
+    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
+
+    return(index);
+}
+
+#else
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  *S         points to a naive bayes instance structure
+ * @param[in]  *in        points to the elements of the input vector.
+ * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ */
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer)
+{
+    uint32_t nbClass;
+    uint32_t nbDim;
+    const float32_t *pPrior = S->classPriors;
+    const float32_t *pTheta = S->theta;
+    const float32_t *pSigma = S->sigma;
+    float32_t *buffer = pBuffer;
+    const float32_t *pIn=in;
+    float32_t result;
+    float32_t sigma;
+    float32_t tmp;
+    float32_t acc1,acc2;
+    uint32_t index;
+
+    pTheta=S->theta;
+    pSigma=S->sigma;
+
+    for(nbClass = 0; nbClass < S->numberOfClasses; nbClass++)
+    {
+
+
+        pIn = in;
+
+        tmp = 0.0;
+        acc1 = 0.0f;
+        acc2 = 0.0f;
+        for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
+        {
+           sigma = *pSigma + S->epsilon;
+           acc1 += logf(2.0f * PI_F * sigma);
+           acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
+
+           pIn++;
+           pTheta++;
+           pSigma++;
+        }
+
+        tmp = -0.5f * acc1;
+        tmp -= 0.5f * acc2;
+
+
+        *buffer = tmp + logf(*pPrior++);
+        buffer++;
+    }
+
+    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
+
+    return(index);
+}
+
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of groupBayes group
+ */

+ 280 - 0
libraries/cmsis/dsp/Source/CMakeLists.txt

@@ -0,0 +1,280 @@
+cmake_minimum_required (VERSION 3.6)
+cmake_policy(SET CMP0077 NEW)
+project(CMSISDSP)
+
+# DSP Sources
+SET(DSP ${ROOT}/CMSIS/DSP)
+
+list(APPEND CMAKE_MODULE_PATH ${DSP}/Source)
+list(APPEND CMAKE_MODULE_PATH ${DSP})
+
+
+include(configLib)
+
+
+option(NEON "Neon acceleration" OFF)
+option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF)
+option(LOOPUNROLL "Loop unrolling" ON)
+option(ROUNDING "Rounding" OFF)
+option(MATRIXCHECK "Matrix Checks" OFF)
+option(HELIUM "Helium acceleration (MVEF and MVEI supported)" OFF)
+option(MVEF "MVEF intrinsics supported" OFF)
+option(MVEI "MVEI intrinsics supported" OFF)
+
+# Select which parts of the CMSIS-DSP must be compiled.
+# There are some dependencies between the parts but they are not tracked
+# by this cmake. So, enabling some functions may require to enable some
+# other ones.
+option(BASICMATH            "Basic Math Functions"              ON)
+option(COMPLEXMATH          "Complex Math Functions"            ON)
+option(CONTROLLER           "Controller Functions"              ON)
+option(FASTMATH             "Fast Math Functions"               ON)
+option(FILTERING            "Filtering Functions"               ON)
+option(MATRIX               "Matrix Functions"                  ON)
+option(STATISTICS           "Statistics Functions"              ON)
+option(SUPPORT              "Support Functions"                 ON)
+option(TRANSFORM            "Transform Functions"               ON)
+option(SVM                  "Support Vector Machine Functions"  ON)
+option(BAYES                "Bayesian Estimators"               ON)
+option(DISTANCE             "Distance Functions"                ON)
+
+# When OFF it is the default behavior : all tables are included.
+option(CONFIGTABLE          "Configuration of table allowed"    OFF)
+
+# When CONFIGTABLE is ON, select if all interpolation tables must be included
+option(ALLFAST              "All interpolation tables included" OFF)
+# When CONFIGTABLE is ON, select if all FFT tables must be included
+option(ALLFFT               "All fft tables included"           OFF)
+
+# Features which require inclusion of a data table.
+# Since some tables may be big, the corresponding feature can be
+# disabled.
+# Those options are taken into account only when CONFIGTABLE is ON
+option(ARM_COS_F32          "cos f32"                           OFF)
+option(ARM_COS_Q31          "cos q31"                           OFF)
+option(ARM_COS_Q15          "cos q15"                           OFF)
+option(ARM_SIN_F32          "sin f32"                           OFF)
+option(ARM_SIN_Q31          "sin q31"                           OFF)
+option(ARM_SIN_Q15          "sin q15"                           OFF)
+option(ARM_SIN_COS_F32      "sin cos f32"                       OFF)
+option(ARM_SIN_COS_Q31      "sin cos q31"                       OFF)
+
+option(ARM_LMS_NORM_Q31     "lms norm q31"                      OFF)
+option(ARM_LMS_NORM_Q15     "lms norm q15"                      OFF)
+
+option(CFFT_F64_16          "cfft f64 16"                       OFF)
+option(CFFT_F64_32          "cfft f64 32"                       OFF)
+option(CFFT_F64_64          "cfft f64 64"                       OFF)
+option(CFFT_F64_128         "cfft f64 128"                      OFF)
+option(CFFT_F64_256         "cfft f64 256"                      OFF)
+option(CFFT_F64_512         "cfft f64 512"                      OFF)
+option(CFFT_F64_1024        "cfft f64 1024"                     OFF)
+option(CFFT_F64_2048        "cfft f64 2048"                     OFF)
+option(CFFT_F64_4096        "cfft f64 4096"                     OFF)
+
+option(CFFT_F32_16          "cfft f32 16"                       OFF)
+option(CFFT_F32_32          "cfft f32 32"                       OFF)
+option(CFFT_F32_64          "cfft f32 64"                       OFF)
+option(CFFT_F32_128         "cfft f32 128"                      OFF)
+option(CFFT_F32_256         "cfft f32 256"                      OFF)
+option(CFFT_F32_512         "cfft f32 512"                      OFF)
+option(CFFT_F32_1024        "cfft f32 1024"                     OFF)
+option(CFFT_F32_2048        "cfft f32 2048"                     OFF)
+option(CFFT_F32_4096        "cfft f32 4096"                     OFF)
+
+option(CFFT_Q31_16          "cfft q31 16"                       OFF)
+option(CFFT_Q31_32          "cfft q31 32"                       OFF)
+option(CFFT_Q31_64          "cfft q31 64"                       OFF)
+option(CFFT_Q31_128         "cfft q31 128"                      OFF)
+option(CFFT_Q31_256         "cfft q31 256"                      OFF)
+option(CFFT_Q31_512         "cfft q31 512"                      OFF)
+option(CFFT_Q31_1024        "cfft q31 1024"                     OFF)
+option(CFFT_Q31_2048        "cfft q31 2048"                     OFF)
+option(CFFT_Q31_4096        "cfft q31 4096"                     OFF)
+
+option(CFFT_Q15_16          "cfft q15 16"                       OFF)
+option(CFFT_Q15_32          "cfft q15 32"                       OFF)
+option(CFFT_Q15_64          "cfft q15 64"                       OFF)
+option(CFFT_Q15_128         "cfft q15 128"                      OFF)
+option(CFFT_Q15_256         "cfft q15 256"                      OFF)
+option(CFFT_Q15_512         "cfft q15 512"                      OFF)
+option(CFFT_Q15_1024        "cfft q15 1024"                     OFF)
+option(CFFT_Q15_2048        "cfft q15 2048"                     OFF)
+option(CFFT_Q15_4096        "cfft q15 4096"                     OFF)
+
+option(RFFT_FAST_F32_32     "rfft fast f32 32"                  OFF)
+option(RFFT_FAST_F32_64     "rfft fast f32 64"                  OFF)
+option(RFFT_FAST_F32_128    "rfft fast f32 128"                 OFF)
+option(RFFT_FAST_F32_256    "rfft fast f32 256"                 OFF)
+option(RFFT_FAST_F32_512    "rfft fast f32 512"                 OFF)
+option(RFFT_FAST_F32_1024   "rfft fast f32 1024"                OFF)
+option(RFFT_FAST_F32_2048   "rfft fast f32 2048"                OFF)
+option(RFFT_FAST_F32_4096   "rfft fast f32 4096"                OFF)
+
+
+option(RFFT_F32_128         "rfft f32 128"                      OFF)
+option(RFFT_F32_512         "rfft f32 512"                      OFF)
+option(RFFT_F32_2048        "rfft f32 2048"                     OFF)
+option(RFFT_F32_8192        "rfft f32 8192"                     OFF)
+
+option(RFFT_FAST_F64_32     "rfft fast f64 32"                  OFF)
+option(RFFT_FAST_F64_64     "rfft fast f64 64"                  OFF)
+option(RFFT_FAST_F64_128    "rfft fast f64 128"                 OFF)
+option(RFFT_FAST_F64_256    "rfft fast f64 256"                 OFF)
+option(RFFT_FAST_F64_512    "rfft fast f64 512"                 OFF)
+option(RFFT_FAST_F64_1024   "rfft fast f64 1024"                OFF)
+option(RFFT_FAST_F64_2048   "rfft fast f64 2048"                OFF)
+option(RFFT_FAST_F64_4096   "rfft fast f64 4096"                OFF)
+
+
+option(RFFT_F64_128         "rfft f64 128"                      OFF)
+option(RFFT_F64_512         "rfft f64 512"                      OFF)
+option(RFFT_F64_2048        "rfft f64 2048"                     OFF)
+option(RFFT_F64_8192        "rfft f64 8192"                     OFF)
+
+option(RFFT_Q31_32          "rfft q31 32"                       OFF)
+option(RFFT_Q31_64          "rfft q31 64"                       OFF)
+option(RFFT_Q31_128         "rfft q31 128"                      OFF)
+option(RFFT_Q31_256         "rfft q31 256"                      OFF)
+option(RFFT_Q31_512         "rfft q31 512"                      OFF)
+option(RFFT_Q31_1024        "rfft q31 1024"                     OFF)
+option(RFFT_Q31_2048        "rfft q31 2048"                     OFF)
+option(RFFT_Q31_4096        "rfft q31 4096"                     OFF)
+option(RFFT_Q31_8192        "rfft q31 8192"                     OFF)
+
+option(RFFT_Q15_32          "rfft q15 32"                       OFF)
+option(RFFT_Q15_64          "rfft q15 64"                       OFF)
+option(RFFT_Q15_128         "rfft q15 128"                      OFF)
+option(RFFT_Q15_256         "rfft q15 256"                      OFF)
+option(RFFT_Q15_512         "rfft q15 512"                      OFF)
+option(RFFT_Q15_1024        "rfft q15 1024"                     OFF)
+option(RFFT_Q15_2048        "rfft q15 2048"                     OFF)
+option(RFFT_Q15_4096        "rfft q15 4096"                     OFF)
+option(RFFT_Q15_8192        "rfft q15 8192"                     OFF)
+
+option(DCT4_F32_128          "dct4 f32 128"                     OFF)
+option(DCT4_F32_512          "dct4 f32 512"                     OFF)
+option(DCT4_F32_2048         "dct4 f32 2048"                    OFF)
+option(DCT4_F32_8192         "dct4 f32 8192"                    OFF)
+
+option(DCT4_Q31_128          "dct4 q31 128"                     OFF)
+option(DCT4_Q31_512          "dct4 q31 512"                     OFF)
+option(DCT4_Q31_2048         "dct4 q31 2048"                    OFF)
+option(DCT4_Q31_8192         "dct4 q31 8192"                    OFF)
+
+option(DCT4_Q15_128          "dct4 q15 128"                     OFF)
+option(DCT4_Q15_512          "dct4 q15 512"                     OFF)
+option(DCT4_Q15_2048         "dct4 q15 2048"                    OFF)
+option(DCT4_Q15_8192         "dct4 q15 8192"                    OFF)
+
+
+###########################
+#
+# CMSIS DSP
+#
+###########################
+
+
+
+add_library(CMSISDSP INTERFACE)
+
+if (BASICMATH)
+  add_subdirectory(BasicMathFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
+endif()
+
+if (COMPLEXMATH)
+  add_subdirectory(ComplexMathFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
+endif()
+
+if (CONTROLLER)
+  add_subdirectory(ControllerFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
+endif()
+
+if (FASTMATH)
+  add_subdirectory(FastMathFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
+endif()
+
+if (FILTERING)
+  add_subdirectory(FilteringFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
+endif()
+
+if (MATRIX)
+  add_subdirectory(MatrixFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
+endif()
+
+if (STATISTICS)
+  add_subdirectory(StatisticsFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
+endif()
+
+if (SUPPORT)
+  add_subdirectory(SupportFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
+endif()
+
+if (TRANSFORM)
+  add_subdirectory(TransformFunctions)
+  # FFT tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
+endif()
+
+if (FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM OR SVM OR DISTANCE)
+  add_subdirectory(CommonTables)
+  if (TRANSFORM)
+    # FFT tables inclusion is allowed
+    if (CONFIGTABLE)
+      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
+    endif()
+  endif()
+  if (FILTERING OR CONTROLLER OR FASTMATH)
+    # Select which tables to include
+    if (CONFIGTABLE)
+      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
+    endif()
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
+  # Common project is adding ComputeLibrary tables used by SVM and Distance
+  # when NEon is ON.
+endif()
+
+if (SVM)
+  add_subdirectory(SVMFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSVM)
+endif()
+
+if (BAYES)
+  add_subdirectory(BayesFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBayes)
+endif()
+
+if (DISTANCE)
+  add_subdirectory(DistanceFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPDistance)
+endif()
+
+### Includes
+target_include_directories(CMSISDSP INTERFACE "${DSP}/Include")
+
+
+

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio