/**
 * This file is part of the Amalthea library.
 * Copyright (C) 2019-2024 Eugene 'Vindex' Stulin
 * Distributed under the BSL 1.0 or (at your option) the GNU LGPL 3.0 or later.
 *
 * The module implements recognition of file formats.
 * Main function of this module:
 *     `FileFormat getFileFormat(string filepath)`
 * Recognition is carried out according to a complex algorithm based on
 * a special file containing information about file formats.
 * The default path of such file for a normal library installation is:
 *     `/etc/amalthea-${DC}/fileformats.json`
 * You can expand the search paths for format descriptions with:
 *     `void addNewPathToFindFileFormats(string dir)`
 */

module amalthea.fileformats;

import
    std.algorithm,
    std.array,
    std.datetime,
    std.file,
    std.format,
    std.json,
    std.path,
    std.process,
    std.stdio,
    std.string,
    std.typecons,
    std.utf;

import amalthea.libcore;
import amalthea.dataprocessing : calcIntersection, copyArray;
import amalthea.fs : getRegularFiles, isSymlinkToDir;

pragma(lib, "gio-2.0");
pragma(lib, "glib-2.0");


/*******************************************************************************
 * Key byte sequence of a file format together with its position in a file.
 */
struct Signature {
    /// Key byte sequence.
    ubyte[] bytes;
    /// Offset of the signature from the beginning of the file.
    uint offset;

    /// Copy constructor: duplicates the byte array instead of sharing it.
    this(ref return scope const Signature rhs) {
        this.bytes = rhs.bytes.dup;
        this.offset = rhs.offset;
    }
}


/*******************************************************************************
 * File format info.
 */
struct FileFormat {
    /// Format name.
    string format;
    /// Key byte sequences and offsets.
    Signature[] signatures;
    /// List of file suffixes used by the format.
    string[] extensions;
    /// Format brief description.
    string description;
    /// Big description. Reserved for future use.
    string extendedDescription;
    /// Format group name (Audio, Pictures, Archives, etc.).
    string group;
    /** If true, then the format is valid only
     *  if both the extension and the signature match.
     */
    bool completeMatchOnly;

    /// Implements comparison: formats are ordered by their names.
    int opCmp(in FileFormat rhs) const {
        if (this.format > rhs.format) {
            return 1;
        }
        if (this.format < rhs.format) {
            return -1;
        }
        return 0;
    }

    /// Copies every field of `rhs` into this object, duplicating arrays.
    private void copy(ref scope const FileFormat rhs) {
        this.format = rhs.format.idup;
        this.signatures = copyArray(rhs.signatures);
        this.extensions = rhs.extensions.dup;
        // idup (not dup): the targets are immutable strings,
        // same as `format` and `group`.
        this.description = rhs.description.idup;
        this.extendedDescription = rhs.extendedDescription.idup;
        this.group = rhs.group.idup;
        this.completeMatchOnly = rhs.completeMatchOnly;
    }

    /// Copy constructor (see `copy`).
    this(ref return scope const FileFormat rhs) {
        copy(rhs);
    }

    /// Assignment (see `copy`).
    ref FileFormat opAssign(ref scope const FileFormat rhs) {
        copy(rhs);
        return this;
    }
}


/*******************************************************************************
 * The function gets the FileFormat structure with info about file format.
 *
 * Params:
 *     filepath = Path to required file.
 *     additExt = File extension as hint for recognition algorithm.
 *                Not needed in most cases. Can be used if for some reason
 *                the file name does not have an extension.
 *
 * Returns: a FileFormat object containing some format information.
 *
 * Throws: FileException if the path does not exist or is not a regular file.
 */
FileFormat getFileFormat(string filepath, string additExt) {
    if (!filepath.exists || !filepath.isFile) {
        // The two-string constructor is used on purpose: with a single string
        // argument the text is taken as the file *name* and the description
        // of the current errno is appended to it.
        throw new FileException(
            filepath, "File is not regular or not readable."
        );
    }
    return signatureSearch(filepath, additExt);
}


/*******************************************************************************
 * The function gets the FileFormat structure with info about file format.
 *
 * Params:
 *     filepath = Path to required file.
 *
 * Returns: a FileFormat object containing some format information.
 */
FileFormat getFileFormat(string filepath) {
    return getFileFormat(filepath, "");
}


/*******************************************************************************
 * The function returns true if specified file contains plain Unicode-text.
 * Any exception (unreadable file, invalid UTF sequence) yields false.
 */
bool isUnicodeTextFile(string filepath) nothrow {
    try {
        File(filepath, "r").byLine.each!validate;
    } catch (Exception e) {
        return false;
    }
    return true;
}


/*
 * Selects the candidates whose extension list contains the wildcard "*"
 * or the (lower-cased) hint `additExt`.
 * Wildcard candidates come first in the returned array.
 * A format is appended once per matching extension entry.
 */
private FileFormat[] searchByExtHint(
    const FileFormat[] candidates, string additExt
) {
    additExt = additExt.toLower;
    FileFormat[] newCandidatesWithExtensions;
    FileFormat[] newCandidatesWithoutExtensions;
    foreach(ftype; candidates) {
        foreach(ext; ftype.extensions) {
            if (ext == "*") {
                newCandidatesWithoutExtensions ~= cast(FileFormat)ftype;
            } else if (additExt == ext) {
                newCandidatesWithExtensions ~= cast(FileFormat)ftype;
            }
        }
    }
    return newCandidatesWithoutExtensions ~ newCandidatesWithExtensions;
}


/*
 * Selects the candidates whose extension list contains the wildcard "*"
 * or an extension found in the (lower-cased) path: either as the final
 * suffix (".ext") or followed by another suffix (".ext.").
 * Wildcard candidates come first in the returned array.
 * A format is appended once per matching extension entry.
 */
private FileFormat[] searchByExtensions(
    const FileFormat[] candidates, string fpath
) {
    fpath = fpath.toLower;
    FileFormat[] newCandidatesWithExtensions;
    FileFormat[] newCandidatesWithoutExtensions;
    foreach(ftype; candidates) {
        foreach(ext; ftype.extensions) {
            auto end = "." ~ ext;
            if (ext == "*") {
                newCandidatesWithoutExtensions ~= cast(FileFormat)ftype;
            } else if (fpath.endsWith(end) || fpath.canFind(end ~ ".")) {
                newCandidatesWithExtensions ~= cast(FileFormat)ftype;
            }
        }
    }
    return newCandidatesWithoutExtensions ~ newCandidatesWithExtensions;
}


/*
 * Selects the candidates compatible with the leading bytes of a file.
 * A format passes if it has no signatures at all, or if one of its
 * signatures fits into `data` and is either empty or equal to the bytes
 * of `data` at the signature offset.
 */
static FileFormat[] searchBySignature(
    const FileFormat[] candidates,
    const ubyte[] data
) {
    FileFormat[] newCandidates;
    foreach(ftype; candidates) {
        if (ftype.signatures.length == 0) {
            newCandidates ~= cast(FileFormat)ftype;
        }
        foreach(s; ftype.signatures) {
            auto off = s.offset;
            auto signature = s.bytes;
            // The signature does not fit into the available data.
            if (signature.length + off > data.length) {
                continue;
            }
            if (signature.empty) {
                newCandidates ~= cast(FileFormat)ftype;
                break;
            } else if (signature == data[off .. off + signature.length]) {
                newCandidates ~= cast(FileFormat)ftype;
                break;
            }
        }
    }
    return newCandidates;
}


/*
 * Chooses a single format out of several candidates.
 * Candidates with the longest signature are preferred; among candidates with
 * the same longest signature, the one with the longest extension matching
 * the end of the file path wins. Without such an extension match,
 * the last of the tied candidates is returned.
 */
private FileFormat identifyLikelyFormat(
    const FileFormat[] candidates,
    string filepath
)
in {
    assert(candidates.length > 1);
}
do {
    size_t getMaxNumberOfBytes(const Signature[] signatures) {
        size_t maxNumber;
        foreach(s; signatures) {
            if (s.bytes.length > maxNumber) {
                maxNumber = s.bytes.length;
            }
        }
        return maxNumber;
    }
    FileFormat result = candidates[0];
    FileFormat[] newCandidates = [result];
    size_t maxNumberOfBytes = getMaxNumberOfBytes(result.signatures);
    size_t currentFormatMaxSignatureLength;
    foreach(f; candidates[1 .. $]) {
        currentFormatMaxSignatureLength = getMaxNumberOfBytes(f.signatures);
        if (currentFormatMaxSignatureLength > maxNumberOfBytes) {
            maxNumberOfBytes = currentFormatMaxSignatureLength;
            result = f;
            newCandidates = [result];
        } else if (currentFormatMaxSignatureLength == maxNumberOfBytes) {
            result = f;
            newCandidates ~= result;
        }
    }
    // candidates with the same signature lengths
    if (newCandidates.length > 1) {
        // The path is lower-cased exactly like in searchByExtensions,
        // otherwise names such as "PHOTO.JPG" never win this tie-break.
        string lowerPath = filepath.toLower;
        size_t currentMaxExtensionLength;
        foreach(ftype; newCandidates) {
            foreach(ext; ftype.extensions) {
                auto end = "." ~ ext;
                if (lowerPath.endsWith(end)) {
                    if (currentMaxExtensionLength < ext.length) {
                        currentMaxExtensionLength = ext.length;
                        result = ftype;
                    }
                }
            }
        }
    }
    return result;
}


/*
 * Returns true for a file that has neither an extension nor content.
 */
private bool identifyFacelessFile(string filepath) {
    if (std.path.extension(filepath) == "" && std.file.getSize(filepath) == 0) {
        return true;
    }
    return false;
}


/*
 * Returns the format description used for plain text files.
 */
private FileFormat getPlainTextFormat() {
    return FileFormat("TXT", null, null, "Plain text document", "", "Text");
}


/*
 * Core of the recognition: reads up to the first 1024 bytes of the file,
 * intersects the signature-based candidates with the extension-based ones
 * and narrows the result down to one format.
 * If the intersection is empty, signature-only matches are considered
 * (except formats requiring a complete match); if nothing is left,
 * altFormatSearch is used.
 */
private FileFormat signatureSearch(string filepath, string additExt = "") {
    auto f = File(filepath, "r");
    ubyte[] buffer;
    buffer.length = 1024;
    ubyte[] data = f.rawRead(buffer);  // todo: mode with O_NOATIME

    FileFormat[] signCandidates = searchBySignature(fileformats, data);
    FileFormat[] extCandidates = additExt.empty ?
        searchByExtensions(fileformats, filepath) :
        searchByExtHint(fileformats, additExt);
    FileFormat[] candidates = calcIntersection(signCandidates, extCandidates);
    if (candidates.empty) {
        signCandidates = signCandidates
            .filter!(c => !c.signatures.empty && !c.completeMatchOnly)
            .array;
        if (signCandidates.empty) {
            return altFormatSearch(filepath);
        } else if (signCandidates.length == 1) {
            return signCandidates[0];
        }
        return identifyLikelyFormat(signCandidates, filepath);
    }
    if (candidates.length == 1) {
        return candidates[0];
    }
    // where candidates.length > 1
    return identifyLikelyFormat(candidates, filepath);
}


// Bindings to GIO/GLib.
// NOTE(review): in C the last parameter of g_content_type_guess is
// `gboolean*` (int-sized), not a one-byte D `bool*`. This module only passes
// null, which is safe; confirm before passing a real pointer.
extern(C)
static char* g_content_type_guess(
    const char* filename,
    const byte* data,
    size_t data_size,
    bool* result_uncertain
);

extern(C)
static void g_free(void* mem);

extern(C)
char* g_content_type_get_mime_type(const char* type);

extern(C)
char* g_content_type_get_description(const char* type);


/*******************************************************************************
 * Returns tuple with MIME type and its description.
 *
 * The guess is made by GIO from the file name and
 * up to the first 1024 bytes of the file.
 *
 * Params:
 *     useEnglish = If true, the LANG environment variable is set to "C"
 *                  for the duration of the call and restored afterwards.
 *     filepath = Path to the file.
 */
auto contentTypeGuess(alias useEnglish=true)(string filepath) {
    static if (useEnglish) {
        // null means "LANG was not set": assigning null back on exit removes
        // the variable instead of leaving an empty LANG behind.
        string lang = environment.get("LANG");
        environment["LANG"] = "C";
        scope(exit) environment["LANG"] = lang;
    }

    auto raw = cast(byte[])read(filepath, 1024);
    byte* data = raw.ptr;
    char* type = g_content_type_guess(
        filepath.toStringz, data, raw.length, null
    );
    // Copy the C strings into GC memory before releasing them with g_free.
    string mimetype = type.fromStringz.idup;
    char* descr = g_content_type_get_description(type);
    string description = descr.fromStringz.idup;
    g_free(type);
    g_free(descr);
    return tuple(mimetype, description);
}


/*
 * Fallback recognition based on the GIO content type guess.
 * The format name is taken from the file extension (upper-cased) or,
 * if there is no extension, from the second part of the MIME type.
 */
private FileFormat altFormatSearch(string filepath) {
    if (identifyFacelessFile(filepath)) {
        return getPlainTextFormat();
    }
    string ext = std.path.extension(filepath).stripLeft('.');
    auto mimeAndDescription = contentTypeGuess(filepath);
    string mimetype = mimeAndDescription[0];
    // Guarded indexing: a content type without '/' must not cause
    // a range violation.
    string[] mimeParts = mimetype.split('/');
    string mimegroup = mimeParts.length > 0 ? mimeParts[0] : "";
    string mimeformat = mimeParts.length > 1 ? mimeParts[1] : "";
    string description = mimeAndDescription[1];

    FileFormat ff;
    ff.format = ext.empty ? mimeformat : ext.toUpper;
    ff.description = description.capitalize;
    if (ff.description == "Plain text document" && ff.format == "plain") {
        ff.format = "TXT";
    } else if (mimeformat.endsWith("src")) {
        ff.format = ff.description;
    }
    if (isUnicodeTextFile(filepath) || mimegroup == "text") {
        ff.group = "Text";
    }
    return ff;
}


/*******************************************************************************
 * The function allows extending the set of paths
 * where JSON files with file formats are searched.
 *
 * Params:
 *     dir = Directory to search for JSON files.
 *
 * Throws: FileException if `dir` does not exist or is neither a directory
 *         nor a symbolic link to a directory.
 */
void addNewPathToFindFileFormats(string dir) {
    // Reject only if the path is neither a directory nor a symlink to one
    // (requiring both at once would reject every ordinary directory).
    if (!exists(dir) || (!isDir(dir) && !isSymlinkToDir(dir))) {
        throw new FileException(dir, "directory not found.");
    }
    auto entries = getRegularFiles(dir, No.hidden)
        .filter!(a => a.path.extension.toLower == ".json");
    JSONValue[] jsonFormatArray;
    foreach(entry; entries) {
        jsonFormatArray ~= entry.path.readText.parseJSON.array;
    }
    extendFileFormatBase(jsonFormatArray);
}


/// Path of the system-wide file with format descriptions.
immutable string canonBase;

// Fills the format base at program start: from the system-wide JSON file
// if it exists, otherwise from the JSON text embedded at compile time.
shared static this() {
    enum fileformatsJSONText = import("res/fileformats.json");
    canonBase = format!"/etc/amalthea-%s/fileformats.json"(amaltheaCompiler);
    JSONValue[] jsonFormatArray;
    if (exists(canonBase)) {
        string text = readText(canonBase);
        jsonFormatArray = parseJSON(text).array;
    } else {
        jsonFormatArray = parseJSON(fileformatsJSONText).array;
    }
    extendFileFormatBase(jsonFormatArray);
}


// Global base of known file formats.
private __gshared FileFormat[] fileformats;

/// Returns array of supported file formats (a copy of the internal base).
FileFormat[] getAllFileFormats() {
    return fileformats.dup;
}


/*
 * Converts JSON descriptions of formats to FileFormat objects
 * and appends them to the global base.
 * Keys read from every JSON object: "format", "signatures" (objects with
 * "offset" and "hex_signature"), "extensions", "description", "group"
 * and the optional "extended_description".
 * NOTE(review): FileFormat.completeMatchOnly is never set here, so it keeps
 * its default value (false); confirm whether a JSON key is expected for it.
 */
private void extendFileFormatBase(JSONValue[] jsonFormatArray) {
    auto spec = singleSpec("%x");
    foreach(j; jsonFormatArray) {
        FileFormat newFormat;
        newFormat.format = j["format"].str;
        foreach(s; j["signatures"].array) {
            Signature newSignature;
            newSignature.offset = cast(uint)s["offset"].integer;
            // Whitespace split: repeated or trailing spaces do not produce
            // empty tokens, which could not be parsed as hex bytes.
            foreach(strByte; s["hex_signature"].str.split()) {
                newSignature.bytes ~= unformatValue!ubyte(strByte, spec);
            }
            newFormat.signatures ~= newSignature;
        }
        foreach(ext; j["extensions"].array) {
            newFormat.extensions ~= ext.str;
        }
        newFormat.description = j["description"].str;
        if ("extended_description" in j) {
            newFormat.extendedDescription = j["extended_description"].str;
        }
        newFormat.group = j["group"].str;
        fileformats ~= newFormat;  // global variable
    }
}