/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
/*-**************************************
* Tuning parameters
****************************************/
#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
/*-**************************************
* Compiler Options
****************************************/
/* Unix Large Files support (>4GB) */
#define _FILE_OFFSET_BITS 64
#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
# ifndef _LARGEFILE_SOURCE
# define _LARGEFILE_SOURCE
# endif
#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
# ifndef _LARGEFILE64_SOURCE
# define _LARGEFILE64_SOURCE
# endif
#endif
/* trivial repetition cases */
if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
/* skip and mark segment */
U16 const pattern16 = MEM_read16(b+pos+4);
U32 u, patternEnd = 6;
while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
for (u=1; u<patternEnd; u++)
doneMarks[pos+u] = 1;
return solution;
}
/* look forward */
{ size_t length;
do {
end++;
length = ZDICT_count(b + pos, b + suffix[end]);
} while (length >= MINMATCHLENGTH);
}
/* look backward */
{ size_t length;
do {
length = ZDICT_count(b + pos, b + *(suffix+start-1));
if (length >=MINMATCHLENGTH) start--;
} while(length >= MINMATCHLENGTH);
}
/* exit if not found a minimum nb of repetitions */
if (end-start < minRatio) {
U32 idx;
for(idx=start; idx<end; idx++)
doneMarks[suffix[idx]] = 1;
return solution;
}
/* largest useful length */
memset(cumulLength, 0, sizeof(cumulLength));
cumulLength[maxLength-1] = lengthList[maxLength-1];
for (i=(int)(maxLength-2); i>=0; i--)
cumulLength[i] = cumulLength[i+1] + lengthList[i];
for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
maxLength = i;
/* reduce maxLength in case of final into repetitive data */
{ U32 l = (U32)maxLength;
BYTE const c = b[pos + maxLength-1];
while (b[pos+l-2]==c) l--;
maxLength = l;
}
if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
# undef DISPLAYUPDATE
# define DISPLAYUPDATE(l, ...) \
do { \
if (notificationLevel>=l) { \
if (ZDICT_clockSpan(displayClock) > refreshRate) { \
displayClock = clock(); \
DISPLAY(__VA_ARGS__); \
} \
if (notificationLevel>=4) fflush(stderr); \
} \
} while (0)
/* init */
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
result = ERROR(memory_allocation);
goto _cleanup;
}
if (minRatio < MINRATIO) minRatio = MINRATIO;
memset(doneMarks, 0, bufferSize+16);
/* limit sample set size (divsufsort limitation)*/
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
/* sort */
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
}
suffix[bufferSize] = (int)bufferSize; /* leads into noise */
suffix0[0] = (int)bufferSize; /* leads into noise */
/* build reverse suffix sort */
{ size_t pos;
for (pos=0; pos < bufferSize; pos++)
reverseSuffix[suffix[pos]] = (U32)pos;
/* note filePos tracks borders between samples.
It's not used at this stage, but planned to become useful in a later update */
filePos[0] = 0;
for (pos=1; pos<nbFiles; pos++)
filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
}
/* ZDICT_flatLit() :
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
*/
static void ZDICT_flatLit(unsigned* countLit)
{
int u;
for (u=1; u<256; u++) countLit[u] = 2;
countLit[0] = 4;
countLit[253] = 1;
countLit[254] = 1;
}
/* collect stats on all samples */
for (u=0; u<nbFiles; u++) {
ZDICT_countEStats(esr, ¶ms,
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
(const char*)srcBuffer + pos, fileSizes[u],
notificationLevel);
pos += fileSizes[u];
}
if (notificationLevel >= 4) {
/* writeStats */
DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
for (u=0; u<=offcodeMax; u++) {
DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
} }
/* analyze, build stats, starting with literals */
{ size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
if (HUF_isError(maxNbBits)) {
eSize = maxNbBits;
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
goto _cleanup;
}
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
assert(maxNbBits==9);
}
huffLog = (U32)maxNbBits;
}
/* looking for most common first offsets */
{ U32 offset;
for (offset=1; offset<MAXREPOFFSET; offset++)
ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
}
/* note : the result of this phase should be used to better appreciate the impact on statistics */
if (maxDstSize<12) {
eSize = ERROR(dstSize_tooSmall);
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
goto _cleanup;
}
# if 0
MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
#else
/* at this stage, we don't use the result of "most common first offset",
* as the impact of statistics is not properly evaluated */
MEM_writeLE32(dstPtr+0, repStartValue[0]);
MEM_writeLE32(dstPtr+4, repStartValue[1]);
MEM_writeLE32(dstPtr+8, repStartValue[2]);
#endif
eSize += 12;
/**
* @returns the maximum repcode value
*/
static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
{
U32 maxRep = reps[0];
int r;
for (r = 1; r < ZSTD_REP_NUM; ++r)
maxRep = MAX(maxRep, reps[r]);
return maxRep;
}
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
const void* customDictContent, size_t dictContentSize,
const void* samplesBuffer, const size_t* samplesSizes,
unsigned nbSamples, ZDICT_params_t params)
{
size_t hSize;
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
BYTE header[HBUFFSIZE];
int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
U32 const notificationLevel = params.notificationLevel;
/* The final dictionary content must be at least as large as the largest repcode */
size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
size_t paddingSize;
/* check conditions */
DEBUGLOG(4, "ZDICT_finalizeDictionary");
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
/* Shrink the content size if it doesn't fit in the buffer */
if (hSize + dictContentSize > dictBufferCapacity) {
dictContentSize = dictBufferCapacity - hSize;
}
/* Pad the dictionary content with zeros if it is too small */
if (dictContentSize < minContentSize) {
RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
"dictBufferCapacity too small to fit max repcode");
paddingSize = minContentSize - dictContentSize;
} else {
paddingSize = 0;
}
/* The dictionary consists of the header, optional padding, and the content.
* The padding comes before the content because the "best" position in the
* dictionary is the last byte.
*/
BYTE* const outDictHeader = (BYTE*)dictBuffer;
BYTE* const outDictPadding = outDictHeader + hSize;
BYTE* const outDictContent = outDictPadding + paddingSize;
/* First copy the customDictContent into its final location.
* `customDictContent` and `dictBuffer` may overlap, so we must
* do this before any other writes into the output buffer.
* Then copy the header & padding into the output buffer.
*/
memmove(outDictContent, customDictContent, dictContentSize);
memcpy(outDictHeader, header, hSize);
memset(outDictPadding, 0, paddingSize);
/* ZDICT_trainFromBuffer_legacy() :
* issue : samplesBuffer need to be followed by a noisy guard band.
* work around : duplicate the buffer, and add the noise */
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_legacy_params_t params)
{
size_t result;
void* newBuff;
size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
newBuff = malloc(sBuffSize + NOISELENGTH);
if (!newBuff) return ERROR(memory_allocation);
memcpy(newBuff, samplesBuffer, sBuffSize);
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */