首页 > 解决方案 > 如何在不先连接所有内容的情况下增量散列postgresql中的聚合结果

问题描述

我想计算一组结果的哈希(sha256),我知道该怎么做:

SELECT digest( string_agg(id_,':'), 'sha256') from mytable order by id_;

它可以工作,但它首先连接所有记录,最后计算哈希。我希望它是增量的,以避免大型数据集(数百万行)可能出现的内存问题,例如:

SELECT digest_agg(id_, ':', 'sha256') from mytable order by id_;

确保散列在途中逐行递增计算。

标签: postgresql

解决方案


好的,我终于为此编写了 ac 扩展名。我把它贴在这里,以防它对某人有用。

/* ------- OS Includes ------------ */
#include <stdio.h>
#include <openssl/rsa.h>
#include <openssl/pem.h>
#include <openssl/err.h>

/* ----- PostgreSQL Includes -------*/
#include "postgres.h"
#include "fmgr.h"
#include "funcapi.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
PG_MODULE_MAGIC;

typedef struct sha256_state 
    {
    SHA256_CTX* sha256;
    bool has_data;
    } SHA256_STATE;

/* --------- Prototypes ---------- */
void _PG_init(void);
void _PG_fini(void);
Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS);
Datum dv_sha256_agg_final(PG_FUNCTION_ARGS);
SHA256_STATE* sha256_init(void);
void sha256_update(SHA256_STATE* state, char* string, int32 string_size);
void sha256_final(SHA256_STATE* state, char outputBuffer[65]);

/* ------ Version1 convention ---- */
PG_FUNCTION_INFO_V1(dv_sha256_agg_sfunc);
PG_FUNCTION_INFO_V1(dv_sha256_agg_final);

// -----------------------------------------------------------
// Implementations
// -----------------------------------------------------------

void _PG_init(void) {/* Do nothing */}
void _PG_fini(void) {/* Do nothing */}

Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS) {

    SHA256_STATE* state;
    bool is_first = PG_ARGISNULL(0);
    text* string;
    int32 string_size;
    char* data;
    char SEP[] = {'\n'}; // Hardcoded separator (can be improved)

    if (is_first) {
        // First iteration: Create the state
        state = sha256_init();
    } 
    else {
        // Next iterations: Restore the state
        state = (SHA256_STATE*) PG_GETARG_POINTER(0);
    }

    if (PG_ARGISNULL(1)) {
        PG_RETURN_POINTER(state);
    }

    // Get current chunk
    string = PG_GETARG_TEXT_PP(1);
    string_size = VARSIZE_ANY_EXHDR(string);
    data = VARDATA_ANY(string);

    // Add separator
    if (state->has_data) {
        sha256_update(state, SEP, 1);
    }

    // Update state
    sha256_update(state, data, string_size);
    state->has_data = true;

    // Updated state
    PG_RETURN_POINTER(state);

}

Datum dv_sha256_agg_final(PG_FUNCTION_ARGS) {
    SHA256_STATE* state;
    char out[65];
    text* hash = (text*) palloc(65 + VARHDRSZ);
    if (PG_ARGISNULL(0)) {
        PG_RETURN_NULL();
    }
    else {
        state = (SHA256_STATE*) PG_GETARG_POINTER(0);
        sha256_final(state, out);
        SET_VARSIZE(hash, 65 + VARHDRSZ);
        memcpy(VARDATA(hash), out, 65);
        PG_RETURN_TEXT_P(hash);
    }    
}

SHA256_STATE* sha256_init() {
    SHA256_STATE* state = (SHA256_STATE*) palloc(sizeof(SHA256_STATE));
    state->sha256 = (SHA256_CTX*) palloc(sizeof(SHA256_CTX));
    SHA256_Init(state->sha256);
    state->has_data = false;
    return state;
}

void sha256_update(SHA256_STATE* state, char* string, int32 string_size) {
    SHA256_Update(state->sha256, string, string_size);
}

void sha256_final(SHA256_STATE* state, char outputBuffer[65]) {
    int i;
    unsigned char hash[SHA256_DIGEST_LENGTH];
    SHA256_Final(hash, state->sha256);
    for(i = 0; i < SHA256_DIGEST_LENGTH; i++) {
        sprintf(outputBuffer + (i * 2), "%02x", hash[i]);
    }
    outputBuffer[64] = 0;
}

聚合定义:

CREATE FUNCTION dv_sha256_agg_sfunc(state internal, input text)
RETURNS internal AS 'MODULE_PATHNAME', 
'dv_sha256_agg_sfunc'
LANGUAGE C VOLATILE;

CREATE FUNCTION dv_sha256_agg_final(state internal)
RETURNS text AS 'MODULE_PATHNAME', 
'dv_sha256_agg_final'
LANGUAGE C VOLATILE;

CREATE AGGREGATE dv_sha256_agg(input text) (
    SFUNC = dv_sha256_agg_sfunc,
    STYPE = internal,
    FINALFUNC = dv_sha256_agg_final
);

测试:

select 
    dv_sha256_agg(id_::text), 
    encode(digest(string_agg(id_::text,E'\n'),'sha256'),'hex') 
from 
    generate_series(1,100) id_(id_);

结果

                          dv_sha256_agg                           |                              encode                              
------------------------------------------------------------------+------------------------------------------------------------------
 4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87 | 4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87
(1 row)

笔记:

  • 分隔符被硬编码为 \n
  • 空值被忽略

推荐阅读