Embedded Performance Engineering · 17/57

SIMD·NEON 활용 — 128-bit Vector·Auto-Vectorization·SVE/SVE2

2026년 4월 24일 · Hawk · 4분 읽기

simd neon sve helium intrinsics

Twitter LinkedIn

#한 줄 요약

**“SIMD = 한 명령으로 여러 데이터”**입니다. 4배 speedup이 흔합니다.

#ARM NEON — 128-bit Vector

Cortex-A 시리즈의 표준입니다. AArch64에서는 128-bit register 32개(v0-v31)를, AArch32(ARMv7)에서는 16개(q0-q15)를 제공합니다.

NEON 128-bit register의 다양한 해석 — 4xf32, 8xi16, 16xi8 등

#Auto-Vectorization — 첫 시도

1
gcc -O3 -mfpu=neon -ftree-vectorize -fopt-info-vec source.c

1
void scale(float *a, float k, int N) {
2
    for (int i = 0; i < N; i++) {
3
        a[i] *= k;
4
    }
5
}

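-O3를 켜면 컴파일러가 자동으로 NEON vmul.f32 q0, q1, q2(AArch64에서는 fmul v0.4s, v1.4s, v2.4s)로 4-way 처리합니다. 단, AArch32 NEON은 IEEE-754를 완전히 따르지 않으므로 GCC는 -funsafe-math-optimizations(또는 -ffast-math)를 함께 줘야 float loop를 vectorize합니다.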

조건은 다음과 같습니다.

명확한 stride가 있어야 합니다 (보통 1).
Alias가 없어야 합니다 (restrict 키워드가 도움이 됩니다).
Loop body에 복잡한 branch가 없어야 합니다 (단순한 조건문은 컴파일러가 if-conversion으로 select/mask 연산으로 바꿔 줍니다).
길이가 vector width의 배수이거나 epilogue로 처리 가능해야 합니다.

#restrict로 vectorizer 도움

1
// 회피
2
void add(float *a, float *b, float *c, int N) {
3
    for (int i = 0; i < N; i++) c[i] = a[i] + b[i];
4
    // 컴파일러: a와 c가 alias할 수도 → vector 못 함
5
}
6

7
// Good
8
void add(float * restrict a, float * restrict b, float * restrict c, int N) {
9
    for (int i = 0; i < N; i++) c[i] = a[i] + b[i];
10
}

#NEON Intrinsics로 직접 작성

1
#include <arm_neon.h>
2

3
void add_neon(float *a, float *b, float *c, int N) {
4
    int i;
5
    for (i = 0; i + 4 <= N; i += 4) {
6
        float32x4_t va = vld1q_f32(&a[i]);
7
        float32x4_t vb = vld1q_f32(&b[i]);
8
        float32x4_t vc = vaddq_f32(va, vb);
9
        vst1q_f32(&c[i], vc);
10
    }
11
    /* Tail */
12
    for (; i < N; i++) c[i] = a[i] + b[i];
13
}

자주 쓰는 intrinsic은 다음과 같습니다.

Intrinsic	동작
`vld1q_f32`	4 float load
`vst1q_f32`	4 float store
`vaddq_f32`	4 float add
`vmulq_f32`	4 float mul
`vfmaq_f32`	fused multiply-add
`vdupq_n_f32`	scalar → 4 element broadcast

#실전 — Dot Product

1
float dot(const float *a, const float *b, int N) {
2
    float32x4_t sum = vdupq_n_f32(0.0f);
3
    int i;
4
    for (i = 0; i + 4 <= N; i += 4) {
5
        float32x4_t va = vld1q_f32(&a[i]);
6
        float32x4_t vb = vld1q_f32(&b[i]);
7
        sum = vfmaq_f32(sum, va, vb);   // sum += a * b
8
    }
9
    /* Horizontal sum */
10
    float32x2_t h = vadd_f32(vget_low_f32(sum), vget_high_f32(sum));
11
    h = vpadd_f32(h, h);
12
    float result = vget_lane_f32(h, 0);
13

14
    for (; i < N; i++) result += a[i] * b[i];
15
    return result;
16
}

Scalar 대비 3-4배 빠릅니다.

#Helium (MVE) — Cortex-M 용 SIMD

Cortex-M55와 M85에서는 MVE(M-profile Vector Extension)를 제공합니다.

1
#include <arm_mve.h>
2

3
void add_mve(int16_t *a, int16_t *b, int16_t *c, int N) {
4
    for (int i = 0; i < N; i += 8) {
5
        int16x8_t va = vld1q(&a[i]);
6
        int16x8_t vb = vld1q(&b[i]);
7
        int16x8_t vc = vaddq(va, vb);
8
        vst1q(&c[i], vc);
9
    }
10
}

NEON과 다른 점은 다음과 같습니다.

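Beat scheme: 128-bit vector 명령 하나를 32-bit 단위의 beat 4개로 나눠 실행합니다. cycle당 처리하는 beat 수는 구현에 따라 다르며(Cortex-M55는 2 beat/cycle), 좁은 datapath로 low power를 유지합니다.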
Predication: vctp16q로 만든 predicate를 vld1q_z/vst1q_p에 넘기면 scalar epilogue 없이 tail을 처리할 수 있습니다. 위 예제는 predicate를 쓰지 않으므로 N이 8의 배수일 때만 안전합니다.
레지스터가 8개뿐입니다 (NEON은 32개).

주로 DSP, 오디오, ML inference에 사용합니다.

#SVE — 가변폭 SIMD

Cortex-A510·A710·X2 등 Armv9 코어에는 SVE2가, Neoverse V1·A64FX에는 SVE가 들어 있습니다 (Cortex-A78·X1은 Armv8.2 기반이라 NEON만 지원합니다). 폭은 구현에 따라 128 ~ 2048 bit으로 다양합니다.

1
#include <arm_sve.h>
2

3
void add_sve(float *a, float *b, float *c, int N) {
4
    int i = 0;
5
    svbool_t pg = svwhilelt_b32(i, N);
6
    while (svptest_first(svptrue_b32(), pg)) {
7
        svfloat32_t va = svld1(pg, &a[i]);
8
        svfloat32_t vb = svld1(pg, &b[i]);
9
        svst1(pg, &c[i], svadd_z(pg, va, vb));
10
        i += svcntw();
11
        pg = svwhilelt_b32(i, N);
12
    }
13
}

Predication(mask)으로 tail handling이 자동으로 됩니다. 길이를 모르는 loop도 안전합니다.

같은 binary가 128-bit, 256-bit SVE 양쪽에서 그대로 동작합니다.

#측정 — IPC와 Throughput

1
// Scalar
2
for (i = 0; i < N; i++) c[i] = a[i] + b[i];
3
// → 1 add per cycle (Cortex-M)
4

5
// NEON
6
float32x4_t va = vld1q_f32(...);
7
// → 4 add per cycle
8

9
// 이론 4x — 실측 3.2-3.8x (load/store가 병목)

Cortex-A(Linux)에서는 perf로, Cortex-M에서는 DWT CYCCNT로 측정합니다.

#Reduction Pattern

1
// 회피 — RAW chain
2
float sum = 0;
3
for (i = 0; i < N; i++) sum += a[i];   // 1 add/cycle (RAW)
4

5
// Good — 4-way reduction
6
float32x4_t acc0 = vdupq_n_f32(0);
7
float32x4_t acc1 = vdupq_n_f32(0);
8
float32x4_t acc2 = vdupq_n_f32(0);
9
float32x4_t acc3 = vdupq_n_f32(0);
10

11
for (i = 0; i + 16 <= N; i += 16) {
12
    acc0 = vaddq_f32(acc0, vld1q_f32(&a[i]));
13
    acc1 = vaddq_f32(acc1, vld1q_f32(&a[i+4]));
14
    acc2 = vaddq_f32(acc2, vld1q_f32(&a[i+8]));
15
    acc3 = vaddq_f32(acc3, vld1q_f32(&a[i+12]));
16
}
17
/* 16-way ILP — load + add latency 가림 */

#Memory Alignment

1
__attribute__((aligned(16))) float a[1024];
2
float32x4_t v = vld1q_f32(a);   // ← aligned load 빠름

NEON은 misaligned 접근도 가능하지만, 정렬해 두면 10-20% 더 빠릅니다. Cortex-M MVE는 정렬을 권장합니다.

#SIMD 적용 어려운 경우

Branch가 많은 경우: predicated 명령으로 회피합니다.
Indirect access: gather/scatter를 써야 하는데 NEON에는 없고, SVE/SVE2와 Helium(MVE)에서만 지원합니다.
Cross-element dependency: prefix sum 같은 recurrence가 어렵습니다.
Bit-level operation: bit manipulation은 vector에 친화적이지 않습니다.

#자주 하는 실수

⚠️ Auto-vectorize 신뢰

gcc -O3라고 해서 항상 vectorize되는 것은 아닙니다. -fopt-info-vec로 확인합니다.

1
gcc -O3 -fopt-info-vec -c src.c
2
# loop vectorized using 16 byte vectors    ← 성공
3
# loop turned into non-loop                 ← 다른 최적화

Vectorize가 안 되었다면 intrinsics를 쓰거나 OpenMP #pragma omp simd를 적용합니다 (GCC/Clang에서는 -fopenmp-simd 옵션이 필요합니다).

⚠️ Tail handling 누락

1
for (i = 0; i < N; i += 4) {   // N=10이면 i=8까지 → tail 2 남음
2
    process_4(arr + i);
3
}

위 loop는 N=10일 때 i=8에서 arr[8..11]을 처리해 배열 범위를 2개 넘깁니다. loop 조건을 i + 4 <= N으로 바꾸고, 남은 tail은 scalar로 처리하거나 SVE/MVE predicate으로 마무리합니다.

⚠️ Mixed precision 무시

1
int16_t a[N]; float b[N];
2
for (i) b[i] = (float)a[i] * 2.0f;   // ← conversion 비쌈

NEON에는 int16 → float32 직접 변환이 없으므로 vmovl_s16으로 32-bit로 확장한 뒤 vcvtq_f32_s32로 변환하거나, 아예 fixed-point로 유지합니다.

⚠️ FP exception 가정

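ARMv7(AArch32) NEON은 항상 flush-to-zero와 default NaN 모드로 동작하므로 denormal이 0으로 처리되고 NaN payload가 보존되지 않는 등 IEEE-754를 완전히 따르지 않습니다. AArch64 NEON은 FPCR 설정(FZ, DN)을 따릅니다. 정밀 수치 코드를 다룬다면 주의가 필요합니다.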

#정리

ARM NEON은 128-bit, 4 × float 구조입니다.
Auto-vectorize는 조건이 까다롭습니다. -O3 -ftree-vectorize와 restrict를 함께 씁니다.
Intrinsics로 직접 작성하면 확실하게 통제할 수 있습니다.
Cortex-M55 이상은 Helium MVE를 지원합니다.
모던 Cortex-A는 SVE/SVE2로 가변폭을 제공합니다.
Reduction은 multiple accumulator로 RAW chain을 회피합니다.

다음 편에서는 PMU를 다룹니다.

#관련 항목

Embedded Performance Engineering · 18 of 57

실전 사례 — CXL.mem 추가로 LLM inference KV cache 처리량 회복

70B 모델 KV cache가 HBM 한계를 넘어 throughput이 무너졌을 때, CXL.mem 256 GB pool 추가로 회복한 실전 케이스.

2026년 6월 16일·cxl

CXL 성능 프로파일링 도구 — cxl-cli·DAMON·perf-mem 활용

CXL.mem 환경 성능 도구 — cxl-cli 토폴로지·DAMON page activity·perf-mem로 보는 CXL 트래픽·numastat 통계.

2026년 6월 16일·cxl

CXL.mem 지연·대역폭 실측 — Direct·Switch·Pooled 토폴로지 비교

CXL.mem 토폴로지별 실측 — Direct attach·Single switch·Multi-host pool의 지연·대역폭 비용 측정.

2026년 6월 16일·cxl

SIMD·NEON 활용 — 128-bit Vector·Auto-Vectorization·SVE/SVE2

#한 줄 요약

#ARM NEON — 128-bit Vector

#Auto-Vectorization — 첫 시도

#restrict로 vectorizer 도움

#NEON Intrinsics로 직접 작성

#실전 — Dot Product

#Helium (MVE) — Cortex-M 용 SIMD

#SVE — 가변폭 SIMD

#측정 — IPC와 Throughput

#Reduction Pattern

#Memory Alignment

#SIMD 적용 어려운 경우

#자주 하는 실수

#정리

#관련 항목

Embedded Performance Engineering · 18 of 57

관련 글

실전 사례 — CXL.mem 추가로 LLM inference KV cache 처리량 회복

CXL 성능 프로파일링 도구 — cxl-cli·DAMON·perf-mem 활용

CXL.mem 지연·대역폭 실측 — Direct·Switch·Pooled 토폴로지 비교

이 글을 참조하는 글 (4)