// Source: core/preprocess_files.js

/*
 * This file is part of PKM (Persistent Knowledge Monitor).
 * Copyright (c) 2020 Capgemini Group, Commissariat à l'énergie atomique et aux énergies alternatives,
 *                    OW2, Sysgo AG, Technikon, Tree Technology, Universitat Politècnica de València.
 * 
 * PKM is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3 as published by
 * the Free Software Foundation.
 * 
 * PKM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with PKM.  If not, see <https://www.gnu.org/licenses/>.
 */

/** Preprocess File documents. Split each file document whose content length
 * exceeds a fixed threshold into several file chunk documents, so that every
 * resulting document stays well below MongoDB's 16 MB BSON document limit.
 * Documents at or below the threshold are passed through unchanged (same
 * object reference, no chunkId property added).
 * 
 * @memberof PKM
 * @instance
 * @param {Array.<Object>} file_documents - file documents; each must have a string 'filecontent' property
 * @param {Object} [options] - options (currently unused; kept for interface compatibility)
 * 
 * @return {Promise<Array.<Object>>} a promise resolved with the file chunk documents, or rejected on error
 */
function preprocess_files(file_documents, options = {})
{
	const Threshold = 4096 * 1024; // 4 Mchars per chunk (shall be a multiple of 4 because some content is base64 encoded).
	// Note: JavaScript strings are UTF-16 (2 bytes per code unit). A UTF-8 character may need two UTF-16 code units.
	// Note: MongoDB has a strong limit of 16 MB for BSON documents in a collection.
	
	// metadata properties propagated from a file document to each of its chunks
	const Metadata_keys = [ 'filename', 'fileType', 'fileMimeType', 'fileFormat', 'fileEncoding', 'gitWorkingTree', 'gitDirty', 'gitUnmerged' ];
	
	try
	{
		const file_chunk_documents = [];
		
		// for each file document
		for(const file_document of file_documents)
		{
			// if length of file content is at or below threshold, keep file document as is
			if(file_document.filecontent.length <= Threshold)
			{
				file_chunk_documents.push(file_document);
				continue;
			}
			
			// copy the metadata once per file: it is identical for every chunk
			const metadata = {};
			for(const key of Metadata_keys)
			{
				if(file_document[key] !== undefined)
				{
					metadata[key] = file_document[key];
				}
			}
			
			// split the file document into several file chunk documents
			for(let offset = 0, chunk_id = 0; offset < file_document.filecontent.length; offset += Threshold, ++chunk_id)
			{
				const file_chunk_document = Object.assign(
				{
					chunkId : chunk_id,
					filecontent : file_document.filecontent.slice(offset, offset + Threshold),
				}, metadata);
				
				file_chunk_documents.push(file_chunk_document);
			}
		}
		
		// serve file chunk documents (the work is synchronous; the promise wrapper preserves the public API)
		return Promise.resolve(file_chunk_documents);
	}
	catch(err)
	{
		// preserve the original contract: synchronous failures reject the returned promise
		return Promise.reject(err);
	}
}

// CommonJS export: expose preprocess_files to the rest of the PKM core.
module.exports.preprocess_files = preprocess_files;