1 /**
2  * This file is part of the Amalthea library.
3  * Copyright (C) 2019-2024 Eugene 'Vindex' Stulin
4  * Distributed under the BSL 1.0 or (at your option) the GNU LGPL 3.0 or later.
5  *
6  * The module implements recognition of file formats.
7  * Main function of this module:
8  *     `FileFormat getFileFormat(string filepath)`
9  * Recognition is carried out according to a complex algorithm based on
10  * a special file containing information about file formats.
11  * The default path of such file for a normal library installation is:
12  *     `/etc/amalthea-${DC}/fileformats.json`
13  * You can expand the search paths for format descriptions with:
14  *     `void addNewPathToFindFileFormats(string dir)`
15  */
16 
17 module amalthea.fileformats;
18 
19 import
20     std.algorithm,
21     std.array,
22     std.datetime,
23     std.file,
24     std.format,
25     std.json,
26     std.path,
27     std.process,
28     std.stdio,
29     std.string,
30     std.typecons,
31     std.utf;
32 
33 import amalthea.libcore;
34 import amalthea.dataprocessing : calcIntersection, copyArray;
35 import amalthea.fs : getRegularFiles, isSymlinkToDir;
36 
37 pragma(lib, "gio-2.0");
38 pragma(lib, "glib-2.0");
39 
40 
/// A key byte sequence together with the position where it is expected.
struct Signature {
    /// Key byte sequence.
    ubyte[] bytes;
    /// Offset of the signature from the beginning of the file.
    uint offset;

    /// Copy constructor: the new object receives its own duplicate of
    /// `rhs.bytes`, so the two objects do not share the byte storage.
    this(ref return scope const Signature rhs) {
        offset = rhs.offset;
        bytes = rhs.bytes.dup;
    }
}
52 
53 
/*******************************************************************************
 * Description of one file format: name, signatures, extensions
 * and human-readable details.
 *
 * The field order matters: the structure is also built positionally
 * elsewhere in this module.
 */
struct FileFormat {
    /// Format name.
    string format;
    /// Key byte sequences and offsets.
    Signature[] signatures;
    /// List of file suffixes used by the format.
    string[] extensions;
    /// Format brief description.
    string description;
    /// Big description. Reserved for future use.
    string extendedDescription;
    /// Format group name (Audio, Pictures, Archives, etc.).
    string group;
    /** If true, then the format is valid only
     *  if both the extension and the signature match.
     */
    bool completeMatchOnly;

    /// Orders formats by the `format` name only; other fields are ignored.
    int opCmp(in FileFormat rhs) const {
        if (this.format == rhs.format) {
            return 0;
        }
        return (this.format < rhs.format) ? -1 : 1;
    }

    /// Fills this object with independent copies of all fields of `rhs`.
    private void copy(ref scope const FileFormat rhs) {
        this.completeMatchOnly = rhs.completeMatchOnly;
        this.format = rhs.format.idup;
        this.group = rhs.group.idup;
        this.description = rhs.description.idup;
        this.extendedDescription = rhs.extendedDescription.idup;
        this.extensions = rhs.extensions.dup;
        this.signatures = copyArray(rhs.signatures);
    }

    /// Copy constructor, implemented through `copy`.
    this(ref return scope const FileFormat rhs) {
        copy(rhs);
    }

    /// Assignment, implemented through `copy`.
    ref FileFormat opAssign(ref scope const FileFormat rhs) {
        copy(rhs);
        return this;
    }
}
105 
106 
/*******************************************************************************
 * The function gets the FileFormat structure with info about file format.
 *
 * Params:
 *     filepath = Path to required file.
 *     additExt = File extension as hint for recognition algorithm.
 *                Not needed in most cases. Can be used if for some reason
 *                the file name does not have an extension.
 *
 * Returns: a FileFormat object containing some format information.
 *
 * Throws: FileException if `filepath` does not exist or is not a file.
 */
FileFormat getFileFormat(string filepath, string additExt) {
    if (!filepath.exists || !filepath.isFile) {
        // Use the (name, message) constructor: the one-argument form treats
        // its argument as a file name and appends the text of the current
        // system error code, which produces a misleading message.
        throw new FileException(
            filepath, "File is not regular or not readable."
        );
    }
    return signatureSearch(filepath, additExt);
}
124 
125 
/*******************************************************************************
 * Convenience overload: recognizes the file format without an extension hint.
 *
 * Params:
 *     filepath = Path to required file.
 *
 * Returns: a FileFormat object containing some format information.
 */
FileFormat getFileFormat(string filepath) {
    enum noExtensionHint = "";
    return getFileFormat(filepath, noExtensionHint);
}
137 
138 
/*******************************************************************************
 * Checks whether every line of the specified file is valid Unicode text.
 *
 * Returns: true if all lines pass validation; false if validation fails
 *          or if any other exception occurs (e.g. the file cannot be opened).
 */
bool isUnicodeTextFile(string filepath) nothrow {
    try {
        auto source = File(filepath, "r");
        foreach (line; source.byLine) {
            // Throws on the first malformed sequence.
            validate(line);
        }
        return true;
    } catch (Exception) {
        return false;
    }
}
150 
151 
/*
 * Selects the formats whose extension list contains "*" or the given
 * extension hint (the hint is lowercased before comparison).
 * A format is appended once per matching entry of its extension list.
 * Formats matched by "*" are placed before formats matched by the hint.
 */
private FileFormat[] searchByExtHint(
    const FileFormat[] candidates, string additExt
) {
    string hint = additExt.toLower;
    FileFormat[] wildcardMatches;
    FileFormat[] hintMatches;
    foreach (candidate; candidates) {
        foreach (knownExt; candidate.extensions) {
            if (knownExt == "*") {
                wildcardMatches ~= cast(FileFormat)candidate;
                continue;
            }
            if (knownExt == hint) {
                hintMatches ~= cast(FileFormat)candidate;
            }
        }
    }
    return wildcardMatches ~ hintMatches;
}
169 
170 
/*
 * Selects the formats whose extension list contains "*" or an extension
 * found in the lowercased file path, either at its end (".ext") or
 * somewhere inside it followed by a dot (".ext.").
 * A format is appended once per matching entry of its extension list.
 * Formats matched by "*" are placed before formats matched by extension.
 */
private FileFormat[] searchByExtensions(
    const FileFormat[] candidates, string fpath
) {
    string loweredPath = fpath.toLower;
    FileFormat[] wildcardMatches;
    FileFormat[] extensionMatches;
    foreach (candidate; candidates) {
        foreach (knownExt; candidate.extensions) {
            if (knownExt == "*") {
                wildcardMatches ~= cast(FileFormat)candidate;
                continue;
            }
            auto suffix = "." ~ knownExt;
            immutable atEnd = loweredPath.endsWith(suffix);
            if (atEnd || loweredPath.canFind(suffix ~ ".")) {
                extensionMatches ~= cast(FileFormat)candidate;
            }
        }
    }
    return wildcardMatches ~ extensionMatches;
}
189 
190 
/*
 * Selects the formats compatible with the given leading bytes of a file.
 *
 * A format is accepted if it has no signatures at all, or if at least one
 * of its signatures fits into `data` (offset + length) and is either empty
 * or equal to the corresponding slice of `data`.
 * Each accepted format is appended exactly once.
 */
static FileFormat[] searchBySignature(
    const FileFormat[] candidates,
    const ubyte[] data
) {
    FileFormat[] accepted;
    foreach (candidate; candidates) {
        bool matches = (candidate.signatures.length == 0);
        if (!matches) {
            foreach (sig; candidate.signatures) {
                auto pattern = sig.bytes;
                auto begin = sig.offset;
                // Signatures reaching beyond the available data are skipped.
                if (pattern.length + begin > data.length) {
                    continue;
                }
                if (pattern.length == 0
                    || pattern == data[begin .. begin + pattern.length]) {
                    matches = true;
                    break;
                }
            }
        }
        if (matches) {
            accepted ~= cast(FileFormat)candidate;
        }
    }
    return accepted;
}
217 
218 
/*
 * Chooses the most probable format among several candidates.
 *
 * First, the candidates whose longest signature has the maximal length are
 * kept. If several such candidates remain, the one with the longest
 * extension matching the end of the file path wins; if no extension
 * matches, the last of the remaining candidates is returned.
 *
 * The extension comparison is case-insensitive, in agreement with
 * searchByExtensions, which matches extensions against a lowercased path.
 *
 * Params:
 *     candidates = Formats to choose from (more than one element).
 *     filepath   = Path of the examined file.
 */
private FileFormat identifyLikelyFormat(
    const FileFormat[] candidates,
    string filepath
)
in {
    assert(candidates.length > 1);
}
do {
    // Length of the longest byte sequence among the given signatures.
    size_t getMaxNumberOfBytes(const Signature[] signatures) {
        size_t maxNumber;
        foreach(s; signatures) {
            if (s.bytes.length > maxNumber) {
                maxNumber = s.bytes.length;
            }
        }
        return maxNumber;
    }
    FileFormat result = candidates[0];
    FileFormat[] newCandidates = [result];
    size_t maxNumberOfBytes = getMaxNumberOfBytes(result.signatures);
    size_t currentFormatMaxSignatureLength;
    foreach(f; candidates[1 .. $]) {
        currentFormatMaxSignatureLength = getMaxNumberOfBytes(f.signatures);
        if (currentFormatMaxSignatureLength > maxNumberOfBytes) {
            // A strictly longer signature replaces all previous candidates.
            maxNumberOfBytes = currentFormatMaxSignatureLength;
            result = f;
            newCandidates = [result];
        } else if (currentFormatMaxSignatureLength == maxNumberOfBytes) {
            result = f;
            newCandidates ~= result;
        }
    }
    // candidates with the same signature lengths
    if (newCandidates.length > 1) {
        // Compare in lower case on both sides so that e.g. "FILE.TAR.GZ"
        // is handled like "file.tar.gz".
        string loweredPath = filepath.toLower;
        size_t currentMaxExtensionLength;
        foreach(ftype; newCandidates) {
            foreach(ext; ftype.extensions) {
                auto end = ("." ~ ext).toLower;
                if (loweredPath.endsWith(end)) {
                    if (currentMaxExtensionLength < ext.length) {
                        currentMaxExtensionLength = ext.length;
                        result = ftype;
                    }
                }
            }
        }
    }
    return result;
}
268 
269 
/*
 * Returns true if the path has no extension and the file size is zero.
 * The size is queried only when the path has no extension.
 */
private bool identifyFacelessFile(string filepath) {
    immutable hasNoExtension = std.path.extension(filepath).length == 0;
    return hasNoExtension && std.file.getSize(filepath) == 0;
}
276 
277 
/*
 * Builds the descriptor used for plain text files: format "TXT",
 * group "Text", no signatures and no extensions.
 */
private FileFormat getPlainTextFormat() {
    FileFormat plainText;
    plainText.format = "TXT";
    plainText.description = "Plain text document";
    plainText.extendedDescription = "";
    plainText.group = "Text";
    return plainText;
}
281 
282 
/*
 * Core recognition routine.
 *
 * Reads up to 1024 leading bytes of the file, collects the formats matching
 * by signature and the formats matching by extension (or by the extension
 * hint, if it is not empty) and intersects both sets.
 * If the intersection is empty, only signature matches with non-empty
 * signature lists and without the `completeMatchOnly` flag are considered;
 * if none remain, the alternative search is used.
 * A single remaining candidate is returned as is; several candidates are
 * resolved by identifyLikelyFormat.
 */
private FileFormat signatureSearch(string filepath, string additExt = "") {
    auto source = File(filepath, "r");
    auto chunk = new ubyte[](1024);
    ubyte[] head = source.rawRead(chunk);  // todo: mode with O_NOATIME

    FileFormat[] bySignature = searchBySignature(fileformats, head);
    FileFormat[] byExtension;
    if (additExt.empty) {
        byExtension = searchByExtensions(fileformats, filepath);
    } else {
        byExtension = searchByExtHint(fileformats, additExt);
    }

    FileFormat[] pool = calcIntersection(bySignature, byExtension);
    if (pool.empty) {
        // Fall back to signature-only matches.
        pool = bySignature
            .filter!(c => !c.signatures.empty && !c.completeMatchOnly)
            .array;
        if (pool.empty) {
            return altFormatSearch(filepath);
        }
    }
    if (pool.length == 1) {
        return pool[0];
    }
    // here pool.length > 1
    return identifyLikelyFormat(pool, filepath);
}
311 
312 
// Bindings to the C functions of GLib/GIO used by this module.
// NOTE(review): the GLib C prototypes presumably declare `data` as
// `const guchar*` and `result_uncertain` as `gboolean*` (a C int) — confirm
// before passing a non-null `bool*`; the only call in this file passes null.
extern(C) {
    /// Guesses the content type from the file name and/or the given data.
    char* g_content_type_guess(
        const char* filename,
        const byte* data,
        size_t data_size,
        bool* result_uncertain
    );

    /// Releases memory allocated by GLib.
    void g_free(void* mem);

    /// Returns the MIME type for the given content type.
    char* g_content_type_get_mime_type(const char* type);

    /// Returns the human-readable description of the given content type.
    char* g_content_type_get_description(const char* type);
}
329 
330 
/*******************************************************************************
 * Guesses the content type of a file with GIO.
 *
 * Up to 1024 leading bytes of the file and the file path are passed
 * to `g_content_type_guess`.
 *
 * Params:
 *     useEnglish = If true, the environment variable LANG is set to "C"
 *                  for the duration of the call and restored afterwards
 *                  (removed again if it was not defined before the call).
 *     filepath = Path to the file.
 *
 * Returns: tuple with the guessed content type string and its description.
 */
auto contentTypeGuess(alias useEnglish=true)(string filepath) {
    static if (useEnglish) {
        // Remember whether LANG existed at all, so that an unset variable
        // is not left behind as an empty one.
        const bool langWasSet = "LANG" in environment;
        string savedLang = environment.get("LANG", "");
        environment["LANG"] = "C";
        scope(exit) {
            if (langWasSet) {
                environment["LANG"] = savedLang;
            } else {
                environment.remove("LANG");
            }
        }
    }

    auto raw = cast(byte[])read(filepath, 1024);
    byte* data = raw.ptr;
    char* type = g_content_type_guess(
        filepath.toStringz, data, raw.length, null
    );
    // Copy the C strings into GC memory before the originals are freed.
    string mimetype = type.fromStringz.idup;
    char* descr = g_content_type_get_description(type);
    string description = descr.fromStringz.idup;
    g_free(type);
    g_free(descr);
    return tuple(mimetype, description);
}
351 
352 
/*
 * Fallback recognition used when no known signature matches.
 *
 * An empty file without extension is reported as plain text. Otherwise
 * the result is built from the file extension and from the content type
 * and description returned by contentTypeGuess.
 * The group is set to "Text" if the file consists of valid Unicode text
 * or if the first part of the content type is "text".
 */
private FileFormat altFormatSearch(string filepath) {
    if (identifyFacelessFile(filepath)) {
        return getPlainTextFormat();
    }
    string ext = std.path.extension(filepath).stripLeft('.');
    auto mimeAndDescription = contentTypeGuess(filepath);
    string mimetype = mimeAndDescription[0];
    string description = mimeAndDescription[1];

    // The guessed type is expected to look like "group/format", but an empty
    // string or a value without '/' must not cause an out-of-range access.
    auto mimeParts = mimetype.split('/');
    string mimegroup = mimeParts.length > 0 ? mimeParts[0] : "";
    string mimeformat = mimeParts.length > 1 ? mimeParts[1] : "";

    FileFormat ff;
    ff.format = ext.empty ? mimeformat : ext.toUpper;
    ff.description = description.capitalize;
    if (ff.description == "Plain text document" && ff.format == "plain") {
        ff.format = "TXT";
    } else if (mimeformat.endsWith("src")) {
        // For content types ending with "src" the description is used
        // as the format name.
        ff.format = ff.description;
    }
    if (isUnicodeTextFile(filepath) || mimegroup == "text") {
        ff.group = "Text";
    }
    return ff;
}
377 
378 
/*******************************************************************************
 * The function extends the base of known file formats with descriptions
 * from all non-hidden JSON files (*.json) located in the given directory.
 *
 * Params:
 *     dir = Directory to search for JSON files.
 *
 * Throws: FileException if `dir` does not exist or is neither a directory
 *         nor a symbolic link to a directory.
 */
void addNewPathToFindFileFormats(string dir) {
    // Accept both real directories and symlinks to directories.
    if (!exists(dir) || !(isDir(dir) || isSymlinkToDir(dir))) {
        // FileException itself joins the name and the message with ": ".
        throw new FileException(dir, "directory not found.");
    }
    auto entries = getRegularFiles(dir, No.hidden)
        .filter!(a => a.path.extension.toLower == ".json");
    JSONValue[] jsonFormatArray;
    foreach(entry; entries) {
        // Every JSON file is expected to contain an array of descriptions.
        jsonFormatArray ~= entry.path.readText.parseJSON.array;
    }
    extendFileFormatBase(jsonFormatArray);
}
397 
398 
/// Path of the installed file with format descriptions
/// (initialized in the module constructor).
immutable string canonBase;

// Module constructor: loads the initial base of file formats either from
// the installed JSON file (if it exists) or from the copy embedded
// into the library at compile time.
shared static this() {
    canonBase = format!"/etc/amalthea-%s/fileformats.json"(amaltheaCompiler);
    enum embeddedJSONText = import("res/fileformats.json");
    string jsonText;
    if (exists(canonBase)) {
        jsonText = readText(canonBase);
    } else {
        jsonText = embeddedJSONText;
    }
    JSONValue[] jsonFormatArray = parseJSON(jsonText).array;
    extendFileFormatBase(jsonFormatArray);
}
412 
413 
// Base of known file formats; filled by extendFileFormatBase.
private __gshared FileFormat[] fileformats;

/// Returns a duplicate (made with `.dup`) of the array
/// of supported file formats.
FileFormat[] getAllFileFormats() {
    FileFormat[] snapshot = fileformats.dup;
    return snapshot;
}
420 
421 
/*
 * Converts JSON format descriptions into FileFormat objects and appends
 * them to the global base of formats.
 *
 * Keys read from every description: "format", "signatures" (array of
 * objects with "offset" and "hex_signature", the latter being hexadecimal
 * byte values separated by single spaces), "extensions", "description",
 * "group" and the optional "extended_description".
 * The `completeMatchOnly` field is not read here and keeps its default value.
 */
private void extendFileFormatBase(JSONValue[] jsonFormatArray) {
    auto hexSpec = singleSpec("%x");
    foreach (entry; jsonFormatArray) {
        FileFormat parsed;
        parsed.format = entry["format"].str;
        foreach (sigNode; entry["signatures"].array) {
            Signature sig;
            sig.offset = cast(uint)sigNode["offset"].integer;
            auto hexTokens = sigNode["hex_signature"].str.split(' ');
            foreach (token; hexTokens) {
                sig.bytes ~= unformatValue!ubyte(token, hexSpec);
            }
            parsed.signatures ~= sig;
        }
        foreach (extNode; entry["extensions"].array) {
            parsed.extensions ~= extNode.str;
        }
        parsed.description = entry["description"].str;
        if (auto extended = "extended_description" in entry) {
            parsed.extendedDescription = extended.str;
        }
        parsed.group = entry["group"].str;
        fileformats ~= parsed;  // global variable
    }
}