首页 > 解决方案 > 带有超过 2GB RAM 限制的 AVX 和 AVX2 指令的 C 程序

问题描述

我正在编写一个 C 程序来定位以“-symbol”开头并以“-symbol”或双换行符 \n 结尾的引用文本。为了让它尽可能快,我使用了 AVX 和 AVX2 指令。我通过 SSH 在其上运行它的 PC 会使用超过 2GB 的 RAM 启动任何进程。

我不明白我的代码的哪一部分正在使用所有这些内存。谁能指导我正确的方向或给我一些见解?

这是被调用的功能。我使用的文本文件是一个 1.2GB 的 txt 文件。

检测报价.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <time.h>

#define DEFINE_INTERFACE 0
#define bad_call(exec) { print_usage(exec); return 1; }
#define bad_alloc() { fprintf(stderr, "out of memory\n"); exit(EXIT_FAILURE); }
#define test_alloc(expr) { if (!(expr)) { bad_alloc(); } }

void
print_usage(const char *exec)
{
    printf("Usage: %s (FILE|-)\n", exec);
    printf("       EXAMPLE: %s text.txt\n", exec);
    printf("       EXAMPLE: gzip -dc text.txt.gz | %s -\n", exec);
    printf("       EXAMPLE: echo \"my interesting \\\"test\\\"\" | %s -\n", exec);
}

// -- Text data structure -----------------------------------------------------

typedef struct {
    /** The text data to analyze. */
    char *characters;

    /** The number of characters in this text. */
    size_t len;

    /** The capacity of the `characters` buffer.*/
    size_t cap;
} Text;

Text
alloc_text()
{
    Text text;
    text.len = 0;
    text.cap = 1024 * 512; // half a megabyte
    text.characters = aligned_alloc(32, sizeof(char) * text.cap);
    test_alloc(text.characters);
    memset(text.characters, 0, text.cap);
    return text;
}

void
require_capacity(Text *text, size_t cap)
{
    // maintain an aditional 64 bytes of zeros after the last character
    if (text->cap > cap + 64)
        return;
    char *new_characters = aligned_alloc(32, sizeof(char) * text->cap * 2);
    test_alloc(new_characters);
    memset(new_characters, 0, text->cap * 2);
    memcpy(new_characters, text->characters, sizeof(char) * text->len);
    free(text->characters);

    text->cap *= 2;
    text->characters = new_characters;
}

void
dealloc_text(Text *text)
{
    text->cap = 0;
    text->len = 0;
    free(text->characters);
    text->characters = NULL;
}

void
load_text(char *file, Text *text)
{
    FILE *fh;
    if (strcmp(file, "-") == 0)
    {
        fh = stdin;
    }
    else
    {
        fh = fopen(file, "r");
    }
    if (fh == NULL)
    {
        fprintf(stderr, "cannot open file %s\n", file);
        exit(EXIT_FAILURE);
    }

    size_t chars_read = 0;
    while (1)
    {
        require_capacity(text, text->len + 1024);
        char *buf = text->characters + text->len;
        if ((chars_read = fread(buf, sizeof(char), 1024, fh)) == 0)
            break;

        if (ferror(fh))
        {
            fprintf(stderr, "error reading file %s\n", file);
            exit(EXIT_FAILURE);
        }

        text->len += chars_read;
    }

    if (strcmp(file, "-") != 0)
        fclose(fh);
}

void
print_text(Text *text)
{
    fwrite(text->characters, sizeof(char), text->len, stdout);
}

void
print_text_quote(Text *text, int quote_begin, int quote_end)
{
    if (quote_begin >= quote_end)
    {
        fprintf(stderr, "invalid quote begin and end positions: %d and %d\n",
                quote_begin, quote_end);
        exit(EXIT_FAILURE);
    }
    char *begin = text->characters + quote_begin;
    int len = quote_end - quote_begin + 1;
    printf("<<<");
    fwrite(begin, sizeof(char), len, stdout);
    printf(">>>");
    putchar('\n');
}

// -- Output data structure ---------------------------------------------------

typedef struct {
    size_t begin;
    size_t end;
} OutputPair;

typedef struct {
    OutputPair *pairs;
    size_t len;
    size_t cap;
} Output;

Output
alloc_output()
{
    Output output;
    output.len = 0;
    output.cap = 1024;
    output.pairs = malloc(sizeof(OutputPair) * output.cap);
    test_alloc(output.pairs);

    return output;
}

/** Function to push a quote starting at position `begin` and ending
 * at postion `end` to the output. */
void
push_pair(Output *output, int begin, int end)
{
    if (output->len == output->cap)
    {
        output->cap *= 2;
        output->pairs = realloc(output->pairs, sizeof(Output) * output->cap);
        test_alloc(output->pairs);
    }

    output->pairs[output->len].begin = begin;
    output->pairs[output->len].end = end;
    output->len++;
}

void
dealloc_output(Output *output)
{
    output->len = 0;
    output->cap = 0;
    free(output->pairs);
    output->pairs = NULL;
}


// ---External symbols: DEFINED IN `submission.c` -------------

void
detect_quotes(Text *text, Output *output);

#include "submission.c"

// ----------------------------------------------------------------------------

int
main(int argc, char **args)
{
    if (argc != 2) bad_call(args[0]);

    char *file = args[1];
    Text text = alloc_text();
    Output output = alloc_output();

    load_text(file, &text);
    //print_text(&text);

    clock_t begin, end;
    double time_spent;

    begin = clock();
    detect_quotes(&text, &output);

    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

//     Debug print: print out all the quotes that were found
//    for (size_t i = 0; i < output.len; ++i)
//    {
//        size_t b = output.pairs[i].begin;
//        size_t e = output.pairs[i].end;
//        printf("quote %lu, %lu: ", b, e);
//        print_text_quote(&text, b, e);
//    }

    printf("\n%lu quotes detected in %f seconds", output.len, time_spent);
    printf("\nThat is %.2f MiB/s\n", (text.len / (1024*1024))/time_spent);

    dealloc_output(&output);
    dealloc_text(&text);
}


This is the main function
    
    int main(int argc, char **args){
        if (argc != 2) bad_call(args[0]);
    
        char *file = args[1];
        Text text = alloc_text();
        Output output = alloc_output();
    
        load_text(file, &text);
        //print_text(&text);
    
        clock_t begin, end;
        double time_spent;
    
        begin = clock();
        detect_quotes(&text, &output);
    
        end = clock();
        time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    
    //     Debug print: print out all the quotes that were found
    //    for (size_t i = 0; i < output.len; ++i)
    //    {
    //        size_t b = output.pairs[i].begin;
    //        size_t e = output.pairs[i].end;
    //        printf("quote %lu, %lu: ", b, e);
    //        print_text_quote(&text, b, e);
    //    }
    
        printf("\n%lu quotes detected in %f seconds", output.len, time_spent);
        printf("\nThat is %.2f MiB/s\n", (text.len / (1024*1024))/time_spent);
    
        dealloc_output(&output);
        dealloc_text(&text);
    }

和提交.c

#include <immintrin.h>
#include <stdbool.h>
#include "stdint.h"

#ifndef DEFINE_INTERFACE
#include <stdio.h>

typedef struct {
    char *characters;
    size_t len;
    size_t cap;
} Text;

typedef struct Output Output;

void
push_pair(Output *output, int begin, int end);

#endif


void __attribute__ ((noinline))
detect_quotes(Text *text, Output *output) {
    __m256i q32 = _mm256_set1_epi8(0x22);
    __m256i nl32 = _mm256_set1_epi8(0x0A);

    int len32 = text->len / 32 * 32;
    len32 = 0;
    int start = 0;
    bool isStartSet = false;
    bool isNlActive = false;
    unsigned int nlPrevLast = 0;
    unsigned int nlThisLast = 0;
    int i = 0;

    for (; i < len32; i += 32) {
        nlCmp = _mm256_loadu_si256((__m256i *) &text->characters[i]);

        __m256i qCmp = _mm256_cmpeq_epi8(nlCmp, q32);
        __m256i nlCmp = _mm256_cmpeq_epi8(nlCmp, nl32);

        unsigned int qMask = _mm256_movemask_epi8(qCmp);
        unsigned int nlMask = _mm256_movemask_epi8(nlCmp);

        nlThisLast = (nlMask & 1 << 31) >> 31;
        nlMask = nlMask & ((nlMask >> 1) | nlPrevLast);

        if (qMask != 0 || (isStartSet && nlMask != 0)) {
            for (int j = 0; j < 32; ++j) {
                if (qMask & 1) {
                    if (isStartSet) {
                        push_pair(output, start, i + j);
                        isStartSet = false;
                    } else {
                        start = i + j;
                        isStartSet = true;
                    }
                } else if (nlMask & 1) {
                    if (isStartSet) {
                        push_pair(output, start, i + j);
                        isStartSet = false;
                    }
                }
                qMask >>= 1;
                nlMask >>= 1;
            }
        }
        nlPrevLast = nlThisLast;
    }
    if (nlPrevLast != 0){
        isNlActive = true;
    }
    for ( ; i < text->len; i++) {
        if (text->characters[i] == '"') {
            if (isStartSet) {
                push_pair(output, start, i);
                isStartSet = false;
            } else {
                start = i;
                isStartSet = true;
            }
            isNlActive = false;
        } else if (text->characters[i] == '\n') {
            if (isStartSet) {
                if (isNlActive) {
                    push_pair(output, start, i);
                } else {
                    isNlActive = true;
                }
            }
        } else {
            isNlActive = false;
        }
    }
}

它们是使用这个 make 文件编译的

CC = gcc
CFLAGS = -Wpedantic -Wall -Wextra -std=c11 -mavx -mavx2 -O3

detect_quotes: detect_quotes.c submission.c
    $(CC) $(CFLAGS) -o detect_quotes detect_quotes.c

detect_quotes_g: detect_quotes.c submission.c
    $(CC) $(CFLAGS) -g -o detect_quotes_g detect_quotes.c

detect_quotes_gprof: detect_quotes.c submission.c
    $(CC) $(CFLAGS) -g -pg -o detect_quotes_gprof detect_quotes.c

最后,可以从我的 Google Drive 下载文本文件的示例,这是一个 41 MB 的 txt 文件

要在编译后运行程序,使用 txt 文件作为第一个也是唯一的参数:detect_quotes.exe gutenbergsmall.txt

标签: cperformancememory-leaksavxavx2

解决方案


推荐阅读