/*
* This file is part of PKM (Persistent Knowledge Monitor).
* Copyright (c) 2020 Capgemini Group, Commissariat à l'énergie atomique et aux énergies alternatives,
* OW2, Sysgo AG, Technikon, Tree Technology, Universitat Politècnica de València.
*
* PKM is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License version 3 as published by
* the Free Software Foundation.
*
* PKM is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with PKM. If not, see <https://www.gnu.org/licenses/>.
*/
/** Preprocess file documents: split each file document whose content exceeds
 * the chunk size threshold into several file chunk documents, so that every
 * resulting document stays well below MongoDB's hard 16 MB BSON document limit.
 *
 * Documents at or below the threshold are passed through unchanged. Oversized
 * documents are replaced by consecutive chunk documents, each carrying a
 * zero-based `chunkId`, the corresponding slice of `filecontent`, and a copy
 * of the file metadata fields present on the original document.
 *
 * @memberof PKM
 * @instance
 * @param {Array.<Object>} file_documents - file documents
 * @param {Object} [options] - options (currently unused, kept for interface compatibility)
 *
 * @return {Promise<Array.<Object>>} a promise resolving to the file chunk documents
 */
async function preprocess_files(file_documents, options = {})
{
    // 4 Mchars per chunk (shall be a multiple of 4 because some contents are base64
    // encoded, and a multiple-of-4 boundary keeps each chunk independently decodable).
    // Note: Javascript strings are UTF-16 (2 bytes per character); a UTF-8 character
    // may need two UTF-16 code units.
    // Note: MongoDB has a strong limit of 16 MB for BSON documents in a collection.
    const Threshold = 4096 * 1024;
    // Metadata fields replicated onto every chunk document when present on the original.
    const Metadata_keys = [ 'filename', 'fileType', 'fileMimeType', 'fileFormat', 'fileEncoding', 'gitWorkingTree', 'gitDirty', 'gitUnmerged' ];
    const file_chunk_documents = [];
    // for each file document
    for(const file_document of file_documents)
    {
        // if length of file content is at or below threshold, keep file document as is
        if(file_document.filecontent.length <= Threshold)
        {
            file_chunk_documents.push(file_document);
            continue;
        }
        // otherwise split the file document into several file chunk documents
        for(let offset = 0, chunk_id = 0; offset < file_document.filecontent.length; offset += Threshold, ++chunk_id)
        {
            const file_chunk_document =
            {
                chunkId : chunk_id,
                filecontent : file_document.filecontent.slice(offset, offset + Threshold),
            };
            // copy the file metadata fields defined on the original document
            Metadata_keys.forEach((key) =>
            {
                if(file_document[key] !== undefined)
                {
                    file_chunk_document[key] = file_document[key];
                }
            });
            file_chunk_documents.push(file_chunk_document);
        }
    }
    // serve file chunk documents
    return file_chunk_documents;
}
// CommonJS export of preprocess_files.
module.exports.preprocess_files = preprocess_files;