Modern Embedded Recipes · 145/152

Zero-Copy Camera Pipeline — V4L2·DMA-BUF·GPU Import·NPU 직결

2026년 4월 21일 · Hawk · 6분 읽기

recipes camera v4l2 dma-buf zero-copy isp libcamera

#한 줄 요약

“Zero-copy camera = 한 frame이 한 physical page를 유지하며 ISP·GPU·NPU·display를 거치는 것입니다.” 1080p × 60 fps frame을 4~6번 copy하면 1 GB/s 안팎의 메모리 대역폭을 그냥 흘려보냅니다(계산은 아래 참고). DMA-BUF로 묶어 copy를 없애면 같은 hardware에서 30 fps에 머물던 work를 60 fps로 처리할 수 있습니다.

#어떤 상황에서 쓰나

자율주행 8-camera vision, 카메라 다중 입력 NVR, drone real-time detection, 산업용 inspection처럼 카메라 → 추론 → 출력이 frame-rate에 묶이는 모든 경우가 후보입니다.

문제는 naive 구현이 너무 흔하다는 점입니다. v4l2src ! videoconvert ! appsink로 GStreamer pipeline을 짜면 stage마다 frame을 user memory로 copy하고 format conversion까지 합니다. 1080p NV12 한 frame이 ~3 MB(1920 × 1080 × 1.5 byte)라서 3 MB × 60 fps × 6 copy ≈ 1.1 GB/s가 낭비됩니다. Memory bandwidth는 edge SoC에서 가장 빠듯한 자원입니다.

DMA-BUF는 Linux kernel의 cross-driver buffer sharing mechanism입니다. V4L2(camera) · DRM(display) · GPU · NPU driver가 같은 physical page를 가리키게 만들어 copy 자체를 없앱니다.

#핵심 개념

Camera부터 display까지 한 frame이 한 physical page를 유지하는 모습을 그림으로 정리합니다.

Zero-copy camera pipeline — DMA-BUF fd로 묶인 한 page

DMA-BUF는 file descriptor로 buffer를 share합니다.

1
Producer 측 (예 V4L2 camera driver)
2
  ↓ VIDIOC_EXPBUF
3
  fd (file descriptor) 발급
4
  ↓
5
Consumer 측 (예 EGL / CUDA / VAAPI)
6
  ↓ eglCreateImageKHR / cudaImportExternalMemory
7
  same physical page를 자기 driver의 handle로 mapping

fd 한 개가 cross-driver permit이 됩니다. Refcount는 kernel이 관리합니다.

V4L2는 buffer 관리 방식이 세 가지입니다.

Mode	동작
`V4L2_MEMORY_MMAP`	driver 측 buffer를 user에 mmap (CPU 접근용, `VIDIOC_EXPBUF`로 DMA-BUF fd export 가능)
`V4L2_MEMORY_USERPTR`	user 측 buffer를 driver에 등록
`V4L2_MEMORY_DMABUF`	외부 DMA-BUF fd를 buffer로 사용 (zero-copy)

DMABUF mode가 핵심입니다. Camera가 ISP DMA로 직접 write한 page를 그대로 GPU·NPU가 read합니다.

NVIDIA Jetson은 한 단계 더 추상화한 *NVMM (NV Memory Manager)*을 씁니다. GStreamer caps에 (memory:NVMM)이 붙으면 pipeline 전체가 NVMM/DMA-BUF로 zero-copy됩니다.

#코드 / 실제 사용 예

#V4L2 DMA-BUF 요청

1
int cam = open("/dev/video0", O_RDWR);
2

3
struct v4l2_format fmt = {
4
    .type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
5
    .fmt.pix_mp = {
6
        .width = 1920, .height = 1080,
7
        .pixelformat = V4L2_PIX_FMT_NV12,
8
        .num_planes = 2,
9
    },
10
};
11
ioctl(cam, VIDIOC_S_FMT, &fmt);
12

13
struct v4l2_requestbuffers req = {
14
    .count  = 4,
15
    .type   = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
16
    .memory = V4L2_MEMORY_DMABUF,
17
};
18
ioctl(cam, VIDIOC_REQBUFS, &req);
19

20
int dma_fds[4];
21
for (int i = 0; i < 4; i++) {
22
    struct v4l2_exportbuffer exp = {
23
        .type  = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
24
        .index = i,
25
    };
26
    ioctl(cam, VIDIOC_EXPBUF, &exp);
27
    dma_fds[i] = exp.fd;
28
}

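dma_fds[]가 cross-driver share용 fd입니다. 참고로 VIDIOC_EXPBUF는 V4L2_MEMORY_MMAP으로 할당한 buffer만 export할 수 있으므로, camera driver가 buffer를 할당해 export하는 구성이라면 REQBUFS의 memory를 V4L2_MEMORY_MMAP으로 요청해야 합니다. V4L2_MEMORY_DMABUF는 다른 driver나 allocator가 만든 fd를 camera에 import할 때 씁니다.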

#EGL import — OpenGL ES texture

1
EGLint attrs[] = {
2
    EGL_WIDTH,                     1920,
3
    EGL_HEIGHT,                    1080,
4
    EGL_LINUX_DRM_FOURCC_EXT,      DRM_FORMAT_NV12,
5
    EGL_DMA_BUF_PLANE0_FD_EXT,     dma_fd,
6
    EGL_DMA_BUF_PLANE0_OFFSET_EXT, 0,
7
    EGL_DMA_BUF_PLANE0_PITCH_EXT,  1920,
8
    EGL_DMA_BUF_PLANE1_FD_EXT,     dma_fd,
9
    EGL_DMA_BUF_PLANE1_OFFSET_EXT, 1920 * 1080,
10
    EGL_DMA_BUF_PLANE1_PITCH_EXT,  1920,
11
    EGL_NONE,
12
};
13
EGLImageKHR image = eglCreateImageKHR(
14
    egl_display, EGL_NO_CONTEXT,
15
    EGL_LINUX_DMA_BUF_EXT, NULL, attrs);
16

17
GLuint tex;
18
glGenTextures(1, &tex);
19
glBindTexture(GL_TEXTURE_EXTERNAL_OES, tex);
20
glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);

Camera DMA-BUF가 GLES texture로 직접 매핑됩니다. Shader가 같은 physical page를 read합니다.

#CUDA import — Jetson

1
cudaExternalMemoryHandleDesc desc = {
2
    .type = cudaExternalMemoryHandleTypeOpaqueFd,
3
    .handle.fd = dma_fd,
4
    .size = 1920 * 1080 * 3 / 2,
5
};
6
cudaExternalMemory_t ext_mem;
7
cudaImportExternalMemory(&ext_mem, &desc);
8

9
cudaExternalMemoryBufferDesc buf_desc = {
10
    .offset = 0,
11
    .size   = 1920 * 1080 * 3 / 2,
12
};
13
void *device_ptr;
14
cudaExternalMemoryGetMappedBuffer(&device_ptr, ext_mem, &buf_desc);
15

16
/* device_ptr를 TensorRT setTensorAddress에 그대로 줄 수 있음 */
17
ctx->setTensorAddress("input", device_ptr);
18
ctx->enqueueV3(stream);

Camera → NPU 사이에 copy가 한 번도 없습니다.

#Capture loop

1
for (int i = 0; i < 4; i++) {
2
    struct v4l2_buffer buf = {
3
        .type   = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
4
        .memory = V4L2_MEMORY_DMABUF,
5
        .index  = i,
6
        .m.fd   = dma_fds[i],
7
    };
8
    ioctl(cam, VIDIOC_QBUF, &buf);
9
}
10

11
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
12
ioctl(cam, VIDIOC_STREAMON, &type);
13

14
while (running) {
15
    struct v4l2_buffer buf = {
16
        .type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
17
        .memory = V4L2_MEMORY_DMABUF,
18
    };
19
    ioctl(cam, VIDIOC_DQBUF, &buf);
20
    int idx = buf.index;
21

22
    inference_on_dma_fd(dma_fds[idx]);
23
    display_on_dma_fd(dma_fds[idx]);
24

25
    ioctl(cam, VIDIOC_QBUF, &buf);
26
}

DQBUF로 frame ownership을 받고 QBUF로 돌려줍니다. 4-buffer ring이 보통이고, application이 한 frame을 처리하는 동안 driver는 나머지 buffer에 다음 frame을 채웁니다.

#GStreamer NVMM pipeline (Jetson)

1
gst-launch-1.0 \
2
  nvarguscamerasrc sensor-id=0 ! \
3
  'video/x-raw(memory:NVMM),width=1920,height=1080,format=NV12,framerate=60/1' ! \
4
  nvvidconv ! \
5
  nvinfer config-file-path=yolo.txt ! \
6
  nvtracker ll-config-file=tracker.yml ! \
7
  nvdsosd ! \
8
  nvegltransform ! nveglglessink

caps에 (memory:NVMM)이 붙으면 pipeline 전체가 NVMM/DMA-BUF 기반으로 zero-copy 동작합니다. Camera ISP → inference → display 전체가 CPU를 거치지 않습니다.

#libcamera — modern stack

1
#include <libcamera/libcamera.h>
2

3
camera->configure(config.get());
4

5
for (auto &fb : framebuffers) {
6
    auto req = camera->createRequest();
7
    req->addBuffer(stream, fb.get());
8
    camera->queueRequest(req.get());
9
}
10

11
/* requestCompleted signal */
12
camera->requestCompleted.connect([](Request *r) {
13
    auto &bufs = r->buffers();
14
    for (auto &[s, fb] : bufs) {
15
        int fd = fb->planes()[0].fd.get();
16
        process_dma_fd(fd);
17
    }
18
    r->reuse(Request::ReuseBuffers);
19
    camera->queueRequest(r);
20
});

libcamera는 Raspberry Pi 5·NXP·산업 카메라가 표준으로 채택한 modern stack입니다. DMA-BUF가 first-class입니다.

#Display — DRM/KMS PRIME

1
struct drm_prime_handle prime = { .fd = dma_fd };
2
ioctl(drm_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &prime);
3

4
uint32_t handles[4] = { prime.handle };
5
uint32_t pitches[4] = { 1920 };
6
uint32_t offsets[4] = { 0 };
7
uint32_t fb_id;
8
drmModeAddFB2(drm_fd, 1920, 1080, DRM_FORMAT_NV12,
9
              handles, pitches, offsets, &fb_id, 0);
10
drmModeSetCrtc(drm_fd, crtc_id, fb_id, 0, 0, &conn_id, 1, &mode);

Camera DMA-BUF가 그대로 framebuffer가 되어 display HW가 read합니다. Compositor 없이 카메라 → 화면이 zero-copy로 흐릅니다.

#Color conversion in shader

1
#version 300 es
2
#extension GL_OES_EGL_image_external_essl3 : require
3
precision highp float;
4

5
uniform samplerExternalOES tex;   /* YUV NV12 직접 sample */
6
in vec2 v_tex;
7
out vec4 color;
8

9
void main() {
10
    color = texture(tex, v_tex);   /* driver가 자동 YUV→RGB */
11
}

samplerExternalOES로 sample하면 driver가 YUV→RGB 변환을 자동으로 수행합니다. CPU에서 conversion하지 않습니다.

#측정 / 성능 비교

Jetson Orin Nano에서 1080p 60 fps 입력에 YOLOv8s 추론과 display를 함께 돌린 결과입니다.

1
Pipeline                                 fps   CPU 사용률   Memory BW
2
v4l2src ! videoconvert ! appsink          25    180%         3.8 GB/s
3
v4l2src ! nvvidconv ! appsink             45     90%         1.7 GB/s
4
nvarguscamerasrc ! nvvidconv ! nvinfer    60     20%         0.6 GB/s
5
                  (NVMM zero-copy)

Naive pipeline(첫 행)과 비교하면 CPU 사용률이 1/9, memory bandwidth가 약 1/6으로 줄어듭니다. 같은 hardware에서 frame rate는 2.4배가 나옵니다.

Multi-camera 8 stream input (Orin AGX) 비교입니다.

구현	Total fps	Memory BW
8× user-space copy pipeline	80	18 GB/s (saturated)
8× NVMM zero-copy DeepStream	480	2.4 GB/s

자율주행 8-camera × 60 fps = 480 fps가 단일 보드에서 가능해지는 이유가 zero-copy입니다.

#자주 보는 함정

V4L2 MMAP을 zero-copy로 오해

1
req.memory = V4L2_MEMORY_MMAP;
2
/* user는 mmap된 buffer를 보고 zero-copy라 생각 */
3
/* 하지만 GPU·NPU에 넘기려면 copy 발생 */

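GPU·NPU와 share하려면 buffer를 DMA-BUF fd로 주고받아야 합니다. MMAP buffer를 VIDIOC_EXPBUF로 export하거나, V4L2_MEMORY_DMABUF로 외부 DMA-BUF를 import합니다. mmap된 pointer만 쓰는 경우는 CPU 처리에만 zero-copy입니다.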

DMA-BUF fd close 누락

1
ioctl(VIDIOC_EXPBUF);   /* fd 4개 */
2
/* close(fd) 빠뜨림 → buffer leak */

Stream stop 시 명시적으로 close합니다. RAII wrapper로 묶는 것이 안전합니다.

Camera·GPU page size 불일치

1
Camera 4 KB page · GPU MMU 64 KB page
2
→ alignment fail → import error

Kernel 안에서는 importer driver가 dma_buf_attach로 buffer에 attach할 때 exporter가 각 device의 제약을 보고 호환 가능한 layout을 고릅니다. 그래도 맞지 않으면 contiguous allocator(CMA)에서 buffer를 할당하도록 fallback합니다.

Format mismatch on import

1
EGL import NV12, GL shader는 RGB texture로 sample
2
→ 화면 검정 또는 색 뒤틀림

NV12로 import한 EGLImage는 GL_TEXTURE_EXTERNAL_OES에 bind하고 samplerExternalOES로 sample합니다. 일반 sampler2D의 RGB texture처럼 다루면 안 됩니다.

USB camera로 zero-copy 시도

1
USB cam → URB → system memory copy → 어떤 trick도 zero-copy 안 됨

Zero-copy를 원하면 CSI camera + ISP path를 씁니다. USB는 본질적으로 한 번 copy가 일어납니다.

Format conversion을 CPU에서

1
yuv420_to_rgb_scalar(src, dst);   /* CPU 50% */

VIC·GPU shader로 옮기면 CPU가 거의 idle해집니다.

#정리

Zero-copy camera는 한 frame이 한 physical page를 유지하며 ISP·GPU·NPU·display를 통과하는 패턴입니다.
V4L2는 VIDIOC_EXPBUF로 카메라 buffer를 DMA-BUF fd로 export하고, V4L2_MEMORY_DMABUF로 외부 DMA-BUF를 import합니다.
EGL EGL_LINUX_DMA_BUF_EXT 또는 CUDA cudaImportExternalMemory로 GPU에 import합니다.
Jetson에서는 caps에 (memory:NVMM)을 붙이면 전체 GStreamer pipeline이 zero-copy로 동작합니다.
libcamera는 modern Linux camera stack이고 DMA-BUF가 first-class입니다.
DRM PRIME으로 카메라 buffer를 directly framebuffer로 쓰면 display까지 zero-copy됩니다.
USB camera는 본질적으로 한 번 copy됩니다. Zero-copy가 필요하면 CSI camera + ISP path를 씁니다.
Memory bandwidth는 edge SoC에서 가장 빠듯한 자원이고 zero-copy는 가장 큰 throughput 회복 기법입니다.

다음 편은 온디바이스 LLM입니다.

#관련 항목

6-05: Jetson
6-07: 온디바이스 LLM
3-03: Zero-Copy
1-04: Device Tree