[C++] false sharing이란? (거짓 공유)

개념

false sharing은 멀티 쓰레드 환경 + CPU의 멀티 코어에서 발생된다.

CPU 내부의 코어와 코어 간에 같은 캐시 라인의 메모리 정보가 공유되어 하드웨어적으로 병목 현상이 일어나는 것이다.

#include <iostream>
#include <thread>
#include <chrono>

// Counters updated by the benchmark functions below.
// num1 and num2 are declared back to back, so they are likely to land on the
// same cache line; num3 is used only by the single-threaded run.
long long num1{0};
long long num2{0};
long long num3{0};

// Adds 1 to the global counter num1 one billion times.
void fun1() {
    constexpr long long kIterations = 1'000'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        ++num1;
    }
}

// Adds 1 to the global counter num2 one billion times.
void fun2() {
    constexpr long long kIterations = 1'000'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        ++num2;
    }
}

// Adds 1 to the global counter num3 two billion times, i.e. the combined
// iteration count of fun1 and fun2.
void fun3() {
    constexpr long long kIterations = 2'000'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        ++num3;
    }
}
// Benchmark driver: times fun1 and fun2 running on two threads, then fun3 on
// the calling thread, and prints each counter total followed by the elapsed
// time in seconds.
// All output goes through std::cout: this program includes <iostream> but not
// <cstdio>, which is the header that declares printf.
int main() {
    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;

    // Multi-threaded run: each thread updates its own counter.
    const auto multiBegin = Clock::now();
    std::thread t1(fun1);
    std::thread t2(fun2);
    t1.join();
    t2.join();
    const Seconds multiElapsed = Clock::now() - multiBegin;

    // Both threads are joined, so the counters can be read safely here.
    const long long multiTotal = num1 + num2;
    std::cout << multiTotal << '\n'
              << multiElapsed.count() << '\n'
              << "--------------------\n";

    // Single-threaded run: same total number of increments on one counter.
    const auto singleBegin = Clock::now();
    fun3();
    const Seconds singleElapsed = Clock::now() - singleBegin;

    const long long singleTotal = num3;
    std::cout << singleTotal << '\n'
              << singleElapsed.count() << std::endl;
}

비슷하거나 빨라야 하는데 1초 차이도 아니고 6초 차이가 나버린다.

CPU의 캐시 구조

L3 캐시는 메모리로부터 자료를 받아온다. 그럼 해당 데이터를 L2 > L1 순으로 전달하게 된다. 그럼 각 코어의 L1 캐시에는 long long num1과 long long num2의 데이터가 있을 것이다.

캐시는 자주 사용하는 데이터를 메모리까지(ram) 가지않고 메모리보다 더 빠른 캐시에 저장함으로써 좀 더 빠르게 데이터를 처리하려고 한다.

CPU 스펙에서 알 수 있듯이 CPU는 Cache Line에서 64 바이트씩 읽어 데이터를 처리한다.

Core 1에는 num1과 num2가 있다, 그 이유는 캐시에서 64 바이트씩 통째로 읽어오기 때문이다.

Core 2에는 num2만 있다. num2를 시작점으로 64 바이트를 통째로 읽어왔기 때문이다.

(이렇게 된 이유는 예제 코드에서의 num1과 num2가 붙어있기 때문에 64 바이트씩 읽어오게 될 경우, 데이터에 두개의 변수 정보가 들어갈 확률이 높다.)

그래서 Core 2는 문제없이 연산하게 되지만 Core 1은 좀 다르다. CPU에서는 캐시 일관성 (cache coherence)이라는 메커니즘이 존재한다. CPU는 그냥 계산만 처리하는 기계이다. 비록 Core 1에서 num2의 계산을 하지 않지만, 캐시 입장에선 데이터가 공유되어 무슨일이 일어날지 알 수 없다, 따라서 Core 1의 연산을 멈추고 num2의 값을 다시 받아온다, 일종의 동기화 작업이다.

이는 데이터의 오류를 줄이고자 진행되는 메커니즘이다. Thread에서의 lock과 비슷하다고 생각하면 된다. 따라서 Core 1은 Core 2의 num2가 사라질 때까지 (데이터 처리가 끝날 때까지) num2 값을 계속 받아오게 된다. (갖고 있던 기존 64바이트를 버리고 캐시에서)

Core 2 역시 num2의 데이터 처리를 막힘없이 계속 진행하지 못하고, 인텔 프로세서의 캐시 일관성 프로토콜인 MESI에 의거하여 해당 데이터를 공유 상태로 처리하고 캐시 데이터에 기록을 계속하기 때문에 엄청난 성능 저하가 일어나 위와 같은 시간 차이가 나온 것이다.

이를 해결하는 방법은 패딩(padding)을 이용해 데이터를 64바이트로 채워주는 것이다. 그럼 캐시간 데이터가 공유되는 상황을 막을 수 있다.

#include <iostream>
#include <thread>
#include <chrono>

// Counters updated by the benchmark functions below.
// num1 and num2 are each aligned to a 64-byte boundary (the change compared
// with the previous listing), so they cannot occupy the same 64-byte block.
alignas(64) long long num1{0};  // changed part
alignas(64) long long num2{0};  // changed part
long long num3{0};

// Adds 1 to the global counter num1 one billion times.
void fun1() {
    constexpr long long kIterations = 1'000'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        ++num1;
    }
}

// Adds 1 to the global counter num2 one billion times.
void fun2() {
    constexpr long long kIterations = 1'000'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        ++num2;
    }
}

// Adds 1 to the global counter num3 two billion times, i.e. the combined
// iteration count of fun1 and fun2.
void fun3() {
    constexpr long long kIterations = 2'000'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        ++num3;
    }
}
// Benchmark driver: times fun1 and fun2 running on two threads, then fun3 on
// the calling thread, and prints each counter total followed by the elapsed
// time in seconds.
// All output goes through std::cout: this program includes <iostream> but not
// <cstdio>, which is the header that declares printf.
int main() {
    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;

    // Multi-threaded run: each thread updates its own counter.
    const auto multiBegin = Clock::now();
    std::thread t1(fun1);
    std::thread t2(fun2);
    t1.join();
    t2.join();
    const Seconds multiElapsed = Clock::now() - multiBegin;

    // Both threads are joined, so the counters can be read safely here.
    const long long multiTotal = num1 + num2;
    std::cout << multiTotal << '\n'
              << multiElapsed.count() << '\n'
              << "--------------------\n";

    // Single-threaded run: same total number of increments on one counter.
    const auto singleBegin = Clock::now();
    fun3();
    const Seconds singleElapsed = Clock::now() - singleBegin;

    const long long singleTotal = num3;
    std::cout << singleTotal << '\n'
              << singleElapsed.count() << std::endl;
}

난수 예제)

#include <iostream>
#include <thread>
#include <chrono>
#include<cstdlib>

// Counters updated by the benchmark functions below.
// They are declared volatile (the change compared with the earlier listings),
// so every read and write in the loops is an actual memory access.
volatile long long num1{0};  // changed part
volatile long long num2{0};  // changed part
volatile long long num3{0};

// Adds the result of rand() to the global counter num1 one hundred million times.
// NOTE(review): whether rand() is thread-safe is implementation-defined, and
// this function is run concurrently with fun2 by main - the measured time may
// include contention inside rand() itself; confirm on the target runtime.
void fun1() {
    constexpr long long kIterations = 100'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        num1 += rand();
    }
}

// Adds the result of rand() to the global counter num2 one hundred million times.
// NOTE(review): whether rand() is thread-safe is implementation-defined, and
// this function is run concurrently with fun1 by main - the measured time may
// include contention inside rand() itself; confirm on the target runtime.
void fun2() {
    constexpr long long kIterations = 100'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        num2 += rand();
    }
}

// Adds the result of rand() to the global counter num3 two hundred million
// times, i.e. the combined iteration count of fun1 and fun2.
void fun3() {
    constexpr long long kIterations = 200'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        num3 += rand();
    }
}

// Benchmark driver: times fun1 and fun2 running on two threads, then fun3 on
// the calling thread, and prints each counter total followed by the elapsed
// time in seconds.
// All output goes through std::cout: this program includes <iostream> but not
// <cstdio>, which is the header that declares printf.
int main() {
    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;

    // Multi-threaded run: each thread updates its own counter.
    const auto multiBegin = Clock::now();
    std::thread t1(fun1);
    std::thread t2(fun2);
    t1.join();
    t2.join();
    const Seconds multiElapsed = Clock::now() - multiBegin;

    // Both threads are joined, so the counters can be read safely here.
    const long long multiTotal = num1 + num2;
    std::cout << multiTotal << '\n'
              << multiElapsed.count() << '\n'
              << "--------------------\n";

    // Single-threaded run: same total number of additions on one counter.
    const auto singleBegin = Clock::now();
    fun3();
    const Seconds singleElapsed = Clock::now() - singleBegin;

    const long long singleTotal = num3;
    std::cout << singleTotal << '\n'
              << singleElapsed.count() << std::endl;
}

바이트 패딩할 시)

#include <iostream>
#include <thread>
#include <chrono>
#include <cstdlib>

// Counters updated by the benchmark functions below.
// num1 and num2 are volatile and each aligned to a 64-byte boundary (the
// change compared with the previous listing), so they cannot occupy the same
// 64-byte block.
alignas(64) volatile long long num1{0};  // changed part
alignas(64) volatile long long num2{0};  // changed part
volatile long long num3{0};

// Adds the result of rand() to the global counter num1 one hundred million times.
// NOTE(review): whether rand() is thread-safe is implementation-defined, and
// this function is run concurrently with fun2 by main - the measured time may
// include contention inside rand() itself; confirm on the target runtime.
void fun1() {
    constexpr long long kIterations = 100'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        num1 += rand();
    }
}

// Adds the result of rand() to the global counter num2 one hundred million times.
// NOTE(review): whether rand() is thread-safe is implementation-defined, and
// this function is run concurrently with fun1 by main - the measured time may
// include contention inside rand() itself; confirm on the target runtime.
void fun2() {
    constexpr long long kIterations = 100'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        num2 += rand();
    }
}

// Adds the result of rand() to the global counter num3 two hundred million
// times, i.e. the combined iteration count of fun1 and fun2.
void fun3() {
    constexpr long long kIterations = 200'000'000LL;
    for (long long done = 0; done != kIterations; ++done) {
        num3 += rand();
    }
}

// Benchmark driver: times fun1 and fun2 running on two threads, then fun3 on
// the calling thread, and prints each counter total followed by the elapsed
// time in seconds.
// All output goes through std::cout: this program includes <iostream> but not
// <cstdio>, which is the header that declares printf.
int main() {
    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;

    // Multi-threaded run: each thread updates its own counter.
    const auto multiBegin = Clock::now();
    std::thread t1(fun1);
    std::thread t2(fun2);
    t1.join();
    t2.join();
    const Seconds multiElapsed = Clock::now() - multiBegin;

    // Both threads are joined, so the counters can be read safely here.
    const long long multiTotal = num1 + num2;
    std::cout << multiTotal << '\n'
              << multiElapsed.count() << '\n'
              << "--------------------\n";

    // Single-threaded run: same total number of additions on one counter.
    const auto singleBegin = Clock::now();
    fun3();
    const Seconds singleElapsed = Clock::now() - singleBegin;

    const long long singleTotal = num3;
    std::cout << singleTotal << '\n'
              << singleElapsed.count() << std::endl;
}

캐시 라인의 크기(false sharing을 피하기 위해 떨어뜨려야 하는 최소 간격)를 얻어오는 상수도 존재한다 (C++17, <new> 헤더)

// std::hardware_destructive_interference_size (C++17, declared in <new>) has
// type std::size_t, so it must be printed with %zu rather than %d.
printf("%zu\n", std::hardware_destructive_interference_size);    // output reported by the article: 64 (the value is implementation-defined)

이외에도 struct나 class 단위도 데이터를 맞춰줄 수 있다

// Example of requesting 32-byte alignment for every object of a class type
// by putting alignas on the class definition.
class alignas(32) A {
    // Members of a class are private unless stated otherwise.
    int num;
    char c;
    int arr[10];
};

// Example of requesting 64-byte alignment for every object of a struct type
// by putting alignas on the struct definition.
struct alignas(64) B {
private:
    // Data members are explicitly private even though this is a struct.
    int num;
    char c;
    int arr[10];
};

윈도우의 visual studio에서는 같은 기능을 제공하는 명령어가 있다 __declspec(align(#))

// Visual Studio (MSVC) spelling of the same idea: __declspec(align(64))
// requests 64-byte alignment for the class type.
class __declspec(align(64)) A {    // position of the specifier does not matter
private:
    int num;
    char c;
    int arr[10];
};

// Same MSVC extension with the specifier written before the struct keyword
// instead of after it.
__declspec(align(64)) struct B {    // position of the specifier does not matter
private:
    // Data members are explicitly private even though this is a struct.
    int num;
    char c;
    int arr[10];
};

패딩 말고도 변수를 할당할 때 주소 값이 떨어지게끔 다른 곳에 선언하거나, openMP를 활용해 제어하는 방법이 있다

https://hwan-shell.tistory.com/230

저작자표시

'프로그래밍 언어 > C++' 카테고리의 다른 글

[C++] volatile 키워드 (0)	2025.03.03
[C++] cin.ignore와 버퍼에 대한 이해 (0)	2024.12.02
[C++] std::map을 value 기준으로 정렬하기 (0)	2024.12.01
[C++] set, map 정렬 기준 바꾸는 방법 (0)	2024.12.01
[C++] 문자열 뒤집는 방법 (0)	2024.11.13

개념

CPU의 캐시 구조

'프로그래밍 언어 > C++' 카테고리의 다른 글

티스토리툴바