[C] 다른 문자 위치 찾기

Notice

LLVM 관련 잡일 의뢰 받습니다.

Recent Posts

Recent Comments

Link

« 2024/12 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30	31

Tags more

Archives

Today

Total

관리 메뉴

Why should I know this?

[C] 다른 문자 위치 찾기 본문

Study

[C] 다른 문자 위치 찾기

die4taoam 2019. 11. 17. 03:11

C에 존재하는 함수 strcmp, strncmp는 String Compare 기능을 제공한다.

이 함수들은

1) 두 문자열의 동일 여부를 확인할 수 있게

2) 두 문자열을 하나씩 비교해 다른 문자가 있을 경우 그 차를 반환

한다.

그러므로 프로그래머는 오직 이 함수들을 통해 두 문자가 다른지 여부만 확인할 수 있다.

이를 조금 손봐서,

1) 두 문자열 간의 다른 문자가 존재하는 최초 위치를 제공하며

2) 동시에 탐색시간을 조금 단축시켜보는 트윅을 해보려고 한다.

int
glibc_strncmp (const char *s1, const char *s2, size_t n)
{
        uintptr_t s1_ori = (uintptr_t)s1;

        unsigned char c1 = '\0';
        unsigned char c2 = '\0';

        if (n >= 4)
        {
                size_t n4 = n >> 2;
                do
                {
                        c1 = (unsigned char) *s1++;
                        c2 = (unsigned char) *s2++;
                        if (c1 == '\0' || c1 != c2)
                                goto out;
                        c1 = (unsigned char) *s1++;
                        c2 = (unsigned char) *s2++;
                        if (c1 == '\0' || c1 != c2)
                                goto out;
                        c1 = (unsigned char) *s1++;
                        c2 = (unsigned char) *s2++;
                        if (c1 == '\0' || c1 != c2)
                                goto out;
                        c1 = (unsigned char) *s1++;
                        c2 = (unsigned char) *s2++;
                        if (c1 == '\0' || c1 != c2)
                                goto out;
                } while (--n4 > 0);
                n &= 3;
        }

        while (n > 0)
        {
                c1 = (unsigned char) *s1++;
                c2 = (unsigned char) *s2++;
                if (c1 == '\0' || c1 != c2)
                        goto out;
                n--;
        }

        return n;
out:
        return (uintptr_t)s1 - s1_ori;
}

먼저 glibc에서 긁어온 strncmp 함수를 조금 변형해서, 다른 문자의 차를 반환하는게 아닌,

위치를 반환하게 하도록 변경했다.

다음은 위의 코드를 조금 변경해서 비트연산을 통해 속도를 조금 향상시킨 버전이다.

int
bitcomp_strncmp(const char *s1, const char *s2, size_t n)
{
        uintptr_t s1_ori = (uintptr_t)s1;
        const int MAX_LEN = 8; // maximum size can be compared at once
        int chunk = 0;

        unsigned char c1 = '\0';
        unsigned char c2 = '\0';


        if (n >= MAX_LEN)
        {
                size_t n8 = n >> 3;
                do {
                        uint64_t s1_val =  *((uint64_t *)s1);
                        uint64_t s2_val = *((uint64_t *)s2);

                        uint64_t res = s1_val ^ s2_val;
                        int pos = 0;

                        if (res > 0) {
                                do {
                                        res <<= 8;
                                        pos++;
                                } while (res > 0);
                                s1 += (8 - pos) +1;
                                goto out;
                        }

                        s1 = s1 + MAX_LEN; s2 = s2 + MAX_LEN;

                } while(--n8 > 0);
                n &= 7;
        }

        while (n > 0)
        {
                c1 = (unsigned char) *s1++;
                c2 = (unsigned char) *s2++;
                if (c1 == '0' || c1 != c2)
                        goto out;
                n--;
        }

        return n;
out:
        return (uintptr_t)s1 - s1_ori;
}

아주 간단한 트윅이므로, 쉽게 이해할 수 있을거라 생각한다.

다음은 이를 테스트하기 위해 만든 코드들이다.

double get_time()
{
        struct timeval t;
        struct timezone tzp;
        gettimeofday(&t, &tzp);
        return t.tv_sec + t.tv_usec*1e-6;
}

char *
getRandomText(int min, int max)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        srand((time_t)ts.tv_nsec);
        int len = rand() % max;
        if (len < min)
                len = min;

        char *res = malloc(len);

        for (int i = 0 ; i < len; i ++)
        {
                clock_gettime(CLOCK_MONOTONIC, &ts);
                srand((time_t)ts.tv_nsec);
                res[i] = (rand() % 0x19) + 0x61; // a start at 0x61
        }

        res[len] = '\0';

        return res;
}

char *
genText(char *target, int pos)
{
        char *temp = strdup(target);
        temp[pos] = '@';
        return temp;
}


int
main()
{
        struct timespec ts;
        int count = 0;

        int expected = 0;
        int glibc_res = 0;
        int bitcomp_res = 0;

        double time = 0.0;

        double glibc_running = 0.0;
        double bitcomp_running = 0.0;

#define min 5
#define max 25
        double glibc_table[max + min];
        double bitcomp_table[max + min];

        do {
                char *text = getRandomText(min, max);
                int len = strlen(text);
                clock_gettime(CLOCK_MONOTONIC, &ts);
                srand((time_t)ts.tv_nsec);
                int diff_pos = (rand() % len);
                char *test = genText(text, diff_pos);

                expected = diff_pos;

                time = get_time();
                glibc_res = glibc_strncmp(text, test, len);
                time = get_time() - time;
                glibc_table[len] += time;

                time = get_time();
                bitcomp_res = bitcomp_strncmp(text, test, len);
                time = get_time() - time;
                bitcomp_table[len] += time;

                if ((count % 10000) == 0) {
                        printf("[%08d] %s %s : %d %d %d\n", count,
                                        text, test, expected, glibc_res, bitcomp_res);
                }

                count++;
                free(text);
                // free(test);
        } while (count < 200000 && expected+1 == glibc_res && expected+1 == bitcomp_res);


        printf("Running time Table \n");

        for (int i = 0 ; i < max; i++) {
                printf("[%02d] GLIBC : %f\t BIT : %f \n", i,
                                glibc_table[i], bitcomp_table[i]);
        }
}

그 결과는 다음과 같다.

glibc에서 긁어온 strncmp를 일부 변경한 함수와 bitcomp_strncmp를 비교하면 bitcomp가 '대부분'의 경우 10~20% 좋은 성능을 나타내는 것을 확인 할 수 있다. 특히 bitcomp의 경우 같은 문자의 경우 한번에 8 글자를 비교할 수 있으므로 문자열의 길이에 영향을 '덜' 받을 수 있다는 장점이 있다.

# 성능 비교 - glibc의 strncmp 변경 vs bitcomp

$ gcc -O2 main.c -o test_1

$ ./test_1
[00000000] bqhkbpqaw bq@kbpqaw : 2 3 3
[00010000] tnxbdsojbpnvdeocquxs tnxbdsojbpnvdeocqu@s : 18 19 19
[00020000] sbtgxgamrwcicipifdighhvj sbtgxgamrwcicipifdighh@j : 22 23 23
[00030000] nuvjkksvaahsnfei nuvjkk@vaahsnfei : 6 7 7
[00040000] brcum @rcum : 0 1 1
[00050000] gqlupwexkdnkbknutcrbeg gqlupwex@dnkbknutcrbeg : 8 9 9
[00060000] tqivmfmfg @qivmfmfg : 0 1 1
[00070000] oaqdxnpoptiys oa@dxnpoptiys : 2 3 3
[00080000] vlilfftc vlilf@tc : 5 6 6
[00090000] oseugsteayeblpltfcdd oseugsteayeblpltfc@d : 18 19 19
[00100000] exmejirjyrvlftd exmejir@yrvlftd : 7 8 8
[00110000] vjhej vj@ej : 2 3 3
[00120000] mllclyhburfycunixti mllclyhb@rfycunixti : 8 9 9
[00130000] eqpyqmattfm eqpy@mattfm : 4 5 5
[00140000] vuiuiisv vuiui@sv : 5 6 6
[00150000] oktlmymldneacpqyv @ktlmymldneacpqyv : 0 1 1
[00160000] dmtwflidwvrfrvwajcriomht dmtwfl@dwvrfrvwajcriomht : 6 7 7
[00170000] letvhnbhggsxmdckukj letvhnbhggsxmdckuk@ : 18 19 19
[00180000] kmrsccuiilpgptdytq kmrsccuii@pgptdytq : 9 10 10
[00190000] ovawrojbifptglhuphjbr ovawrojbifptglhuph@br : 18 19 19
Running time Table
[00] GLIBC : 0.000000    BIT : 0.000000
[01] GLIBC : 0.000000    BIT : 0.000000
[02] GLIBC : 0.000000    BIT : 0.000000
[03] GLIBC : 0.000000    BIT : 0.000000
[04] GLIBC : 0.000000    BIT : 0.000000
[05] GLIBC : 0.001272    BIT : 0.001326
[06] GLIBC : 0.000284    BIT : 0.000203
[07] GLIBC : 0.000256    BIT : 0.000236
[08] GLIBC : 0.000242    BIT : 0.000253
[09] GLIBC : 0.000272    BIT : 0.000195
[10] GLIBC : 0.000257    BIT : 0.000259
[11] GLIBC : 0.000237    BIT : 0.000188
[12] GLIBC : 0.000242    BIT : 0.000212
[13] GLIBC : 0.000258    BIT : 0.000231
[14] GLIBC : 0.000411    BIT : 0.000198
[15] GLIBC : 0.000279    BIT : 0.000233
[16] GLIBC : 0.000269    BIT : 0.000224
[17] GLIBC : 0.000311    BIT : 0.000206
[18] GLIBC : 0.000240    BIT : 0.000234
[19] GLIBC : 0.000286    BIT : 0.000231
[20] GLIBC : 0.000278    BIT : 0.000245
[21] GLIBC : 0.000294    BIT : 0.000225
[22] GLIBC : 0.000347    BIT : 0.000206
[23] GLIBC : 0.000328    BIT : 0.000317
[24] GLIBC : 0.000323    BIT : 0.000274
[SUM] GLIBC : 0.006687   BIT : 0.005695

그렇다면 실제 glibc에 존재하는 strncmp와의 성능비교는 어떻게 나올까?

# 성능 비교 - glibc의 strncmp(순정) vs bitcomp

$ gcc -O2 main_strncmp.c -o test_2

$ ./test_2

$ ./test_2
[00000000] qlkel qlke@ : 4 44 5
[00010000] sfkxa sfkx@ : 4 33 5
[00020000] kglhsykboigrkuifdud kglhsykboi@rkuifdud : 10 39 11
[00030000] fjcge fjc@e : 3 39 4
[00040000] tmyhq @myhq : 0 52 1
[00050000] mnypyowrkvlvtvfipjbbq mnypyowrkvl@tvfipjbbq : 11 54 12
[00060000] ldswfqkejhcgat ldswf@kejhcgat : 5 49 6
[00070000] xeibejluv xeibejlu@ : 8 54 9
[00080000] eiglqmo eig@qmo : 3 44 4
[00090000] ehfactxqimkinuospexkog ehfactxqimkinuos@exkog : 16 48 17
[00100000] anrvnaklhugckl anrvnaklhugck@ : 13 44 14
[00110000] waheh w@heh : 1 33 2
[00120000] vtrsx v@rsx : 1 52 2
[00130000] kbfbqt @bfbqt : 0 43 1
[00140000] ssujx @sujx : 0 51 1
[00150000] rvjsatshf rvj@atshf : 3 51 4
[00160000] cqsesmeoeswi cqs@smeoeswi : 3 37 4
[00170000] gtsiywrpdvtnyv gt@iywrpdvtnyv : 2 51 3
[00180000] fudgaimpycseqyrigogf fudgaimpyc@eqyrigogf : 10 51 11
[00190000] fmfhldkpwhgagrarqnrxkur fmfhldkpwhgagrarq@rxkur : 17 46 18
Running time Table
[00] GLIBC : 0.000000    BIT : 0.000000
[01] GLIBC : 0.000000    BIT : 0.000000
[02] GLIBC : 0.000000    BIT : 0.000000
[03] GLIBC : 0.000000    BIT : 0.000000
[04] GLIBC : 0.000000    BIT : 0.000000
[05] GLIBC : 0.001408    BIT : 0.001287
[06] GLIBC : 0.000202    BIT : 0.000249
[07] GLIBC : 0.000193    BIT : 0.000266
[08] GLIBC : 0.000198    BIT : 0.000235
[09] GLIBC : 0.000211    BIT : 0.000191
[10] GLIBC : 0.000200    BIT : 0.000211
[11] GLIBC : 0.000224    BIT : 0.000226
[12] GLIBC : 0.000177    BIT : 0.000250
[13] GLIBC : 0.000217    BIT : 0.000234
[14] GLIBC : 0.000213    BIT : 0.000221
[15] GLIBC : 0.000178    BIT : 0.000246
[16] GLIBC : 0.000184    BIT : 0.000237
[17] GLIBC : 0.000197    BIT : 0.000229
[18] GLIBC : 0.000251    BIT : 0.000238
[19] GLIBC : 0.000230    BIT : 0.000242
[20] GLIBC : 0.000265    BIT : 0.000213
[21] GLIBC : 0.000191    BIT : 0.000244
[22] GLIBC : 0.000210    BIT : 0.000247
[23] GLIBC : 0.000251    BIT : 0.000252
[24] GLIBC : 0.000244    BIT : 0.000272
[SUM] GLIBC : 0.005442   BIT : 0.005790

순정과 비교하면 '일부'를 제외하고는 거의 대부분 성능이 안좋게 나오는 것을 확인할 수 있다...

BITCOMP의 변을 하자면, Glibc의 Strncmp는 두 문자열을 비교해 첫 번째 다른 문자를 찾아내면 그 '차'를 반환한다는 것이다. 예를 들어 'abc', 'abC'를 비교하면 99-67 = 32을 반환할 것이다. 만약 'abC'와 'abc'를 비교하면 67-99 = -32를 반환하게 된다.

또한 순정 strncmp는 사실 strncmp.c에 들어있는 내용과 많이 다르다.

이에 대한 내용은 strncmp.S를 참고하면 된다. 실제 코드는 훨씬 고성능을 낼 수 있게 되어 있다.

하지만 bitcomp 역시 아직 최적화 할 요소가 조금 남아있다고~??

'Study' 카테고리의 다른 글

Control Flow Integrity for COTS Binaries (0)	2023.02.04
Class linking 메커니즘 - jvm (0)	2022.09.30
역공학(=Reverse Engineering)을 위한 컴퓨터 기본 원리 (0)	2022.02.15
git 복구 (0)	2019.09.25
git 계정 설정 (0)	2017.12.17

'Study' Related Articles

Comments

Why should I know this?

[C] 다른 문자 위치 찾기 본문

[C] 다른 문자 위치 찾기

'Study' 카테고리의 다른 글

티스토리툴바