/* This file is part of the Amalthea library.
 *
 * Copyright (C) 2018-2021 Eugene 'Vindex' Stulin
 *
 * Distributed under the Boost Software License 1.0 or (at your option)
 * the GNU Lesser General Public License 3.0 or later.
 */

module amalthea.net;

public import amalthea.libcore;

import amalthea.dataprocessing,
       amalthea.encoding,
       amalthea.fs;

import std.algorithm, std.format, std.range, std.regex, std.string;
import std.net.curl;

static alias stdGet = std.net.curl.get;
static alias stdPost = std.net.curl.post;

alias urlEncode = amalthea.dataprocessing.urlEncode;

version (DigitalMars)
    alias StdNetException = std.net.curl.HTTPStatusException;
else
    alias StdNetException = std.net.curl.CurlException;


/*******************************************************************************
 * Exception for common errors in this module.
 */
class AmaltheaNetException : Exception {
    /***************************************************************************
     * Creates the exception.
     * Params:
     *     msg  = Human-readable description of the error.
     *     file = Source file where the error was raised.
     *     line = Source line where the error was raised.
     *     next = Optional underlying cause, so that a lower-level error
     *            can be chained instead of being lost.
     */
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable next = null) @safe pure nothrow @nogc {
        super(msg, file, line, next);
    }
}


/*******************************************************************************
 * GET request to raw content.
 * Returns: Byte array.
*/
ubyte[] getRaw(string url) {
    import std.array : appender;

    auto client = HTTP();
    // NOTE(review): TLS peer verification is switched off, so the server
    // certificate is not checked. Kept as is because callers may rely on it;
    // confirm that this is intended.
    client.verifyPeer = false;
    client.method = HTTP.Method.get;
    client.url = url;

    // The body arrives in chunks; collect them all.
    auto content = appender!(ubyte[])();
    client.onReceive = (ubyte[] data) {
        content ~= data;
        return data.length;
    };

    HTTP.StatusLine statusLine;
    client.onReceiveStatusLine = (HTTP.StatusLine l) { statusLine = l; };
    client.perform();

    // Anything outside 2xx is reported as an error.
    if (statusLine.code / 100 != 2) {
        string exceptionMsg = format(
            "HTTP request returned status code %d (%s)",
            statusLine.code, statusLine.reason
        );
        throw new HTTPStatusException(statusLine.code, exceptionMsg);
    }
    return content.data;
}


/*******************************************************************************
 * POST request.
 * Params:
 *     url      = Address to send the request to.
 *     postDict = Form fields; they are passed through urlEncode
 *                before sending.
 *     conn     = HTTP connection object to use for the request.
 * Returns: Response body as an array of T (char or ubyte).
 */
T[] post(T = char)(const(char)[] url,
                   string[string] postDict,
                   HTTP conn = HTTP())
if (is(T == char) || is(T == ubyte)) {
    return stdPost!T(url, urlEncode(postDict), conn);
}


/*******************************************************************************
 * POST request to get an answer with raw content.
 * Returns: Byte array.
 */
ubyte[] postRaw(const(char)[] url,
                string[string] postDict,
                HTTP conn = HTTP()) {
    // Fully qualified so that this module's post is chosen over
    // std.net.curl.post.
    return amalthea.net.post!ubyte(url, postDict, conn);
}


/*******************************************************************************
 * Gets HTTP response headers by URL.
 * Params:
 *     url = Address to request.
 * Returns: Response headers as an associative array (name -> value).
 */
string[string] getHeaders(in char[] url) {
    auto http = HTTP(url);
    // Without an onReceive handler libcurl writes the response body to stdout.
    // Only the headers are wanted here, so the body is silently dropped.
    http.onReceive = (ubyte[] data) { return data.length; };
    http.perform();
    return http.responseHeaders;
}


/*******************************************************************************
 * Gets HTTP content type by URL.
*/
string getContentType(in char[] url) {
    // Header names are compared case-insensitively.
    foreach (name, value; getHeaders(url)) {
        if (name.toLower == "content-type") {
            return value;
        }
    }
    return null;
}


/*******************************************************************************
 * Gets content charset (possibly empty) by URL.
 */
string getCharset(const(char)[] url) {
    return extractCharsetFromContentType(getContentType(url));
}


/*******************************************************************************
 * Checks if the URL is a link to HTML page.
 * Params:
 *     url = Address to check.
 * Returns: true if the Content-Type of the response mentions "text/html"
 *          (in any letter case); false otherwise, including the case
 *          when the request fails with an exception.
 */
bool isLinkToHTML(string url) {
    try {
        // Media type names are case-insensitive ("Text/HTML" is valid),
        // so normalise before searching.
        return getContentType(url).toLower.canFind("text/html");
    } catch (Exception e) {
        // Best effort: an unreachable or broken URL is simply "not HTML".
        return false;
    }
}


/*******************************************************************************
 * Get text content as amalthea.encoding.UniString by URL.
 * The implementation of 'get' from the standard library is taken as a basis.
*/
UniString getPage(string url) {
    import std.algorithm.comparison : min;
    import std.array : appender;
    import std.conv : ConvException, to;

    // Upper bound for the speculative pre-allocation driven by the
    // Content-Length header; larger bodies still work, the buffer just grows.
    enum size_t maxReserve = 64 * 1024 * 1024;

    auto client = HTTP();
    // NOTE(review): TLS peer verification is switched off, so the server
    // certificate is not checked. Kept as is because callers may rely on it;
    // confirm that this is intended.
    client.verifyPeer = false;
    client.method = HTTP.Method.get;
    client.url = url;

    // The body arrives in chunks; collect them all.
    auto content = appender!(ubyte[])();
    client.onReceive = (ubyte[] data) {
        content ~= data;
        return data.length;
    };

    string contentType;
    string charset;
    client.onReceiveHeader = (in char[] key,
                              in char[] value) {
        auto lowerKey = key.idup.toLower;
        if (lowerKey == "content-length") {
            // The header comes from the server and may be malformed or
            // absurdly large. Reserving is only an optimisation, so a bad
            // value is ignored instead of throwing out of the callback.
            try {
                content.reserve(min(value.strip.to!size_t, maxReserve));
            } catch (ConvException) {
                // No reservation; the appender grows on demand.
            }
        } else if (lowerKey == "content-type") {
            contentType = value.idup;
            charset = extractCharsetFromContentType(contentType);
        }
    };

    HTTP.StatusLine statusLine;
    client.onReceiveStatusLine = (HTTP.StatusLine l) { statusLine = l; };
    client.perform();

    // Anything outside 2xx is reported as an error.
    if (statusLine.code / 100 != 2) {
        string exceptionMsg = format(
            "HTTP request returned status code %d (%s)",
            statusLine.code, statusLine.reason
        );
        throw new HTTPStatusException(statusLine.code, exceptionMsg);
    }
    // Fall back to UTF-8 when the server did not announce a charset.
    if (charset.empty) {
        charset = "UTF-8";
    }
    auto page = UniString(charset, content.data);
    return page;
}



/*******************************************************************************
 * This function searches all elements from HTML page with a specific tag.
 * Params:
 *     html = Text of the HTML page.
 *     tag  = Tag name to look for.
 * Returns: The result of getElementsByTagAndAttribute called
 *          without attribute restrictions.
 */
auto getElementsByTag(string html, string tag) {
    return getElementsByTagAndAttribute(html, tag);
}


/*******************************************************************************
 * This function searches all elements by a specific tag and an attribute.
 * Returns: Page element info (tag name and possible attributes)
 *          and content of the element as tuple with two elements.
*/
auto getElementsByTagAndAttribute(string html,
                                  string tag,
                                  string attrName = "",
                                  string attrValue = "") {
    alias Element = Tuple!(string, "declaration", string, "content");

    string lowerTag = tag.toLower;
    string upperTag = tag.toUpper;

    // Flatten the page to a single line, then put every element with the
    // requested tag on a line of its own (tag name normalised to lower case).
    // The upper-case spelling is processed first, then the lower-case one.
    html = html.replace("\n", " ");
    foreach (spelling; [upperTag, lowerTag]) {
        html = html.replace("<"~spelling~" ", "\n<"~lowerTag~" ")
                   .replace("<"~spelling~">", "\n<"~lowerTag~">")
                   .replace("</"~spelling~" ", "</"~lowerTag~" \n")
                   .replace("</"~spelling~">", "</"~lowerTag~">\n");
    }

    // A requested attribute value is matched with optional quotes around it.
    if (!attrValue.empty) {
        attrValue = ` *= *"?` ~ attrValue ~ `"?`;
    }

    // First try the paired form: <tag ...>content</tag>.
    string openingTag = attrName.empty ? tag : tag~" ";
    string pattern =
        format!`^<(?P<declaration>%s[^>]*%s%s *[^>]*)>(?P<content>.*)</%s *>$`
        (openingTag, attrName, attrValue, tag);
    auto re = regex(pattern, "im");

    // If nothing matches, fall back to the unpaired form: <tag ...>.
    bool pairedTag = !matchAll(html, re).empty;
    if (!pairedTag) {
        pattern = format!`^<(?P<declaration>%s [^>]*%s%s *[^>]*)>`
                  (tag, attrName, attrValue);
        re = regex(pattern, "im");
    }

    Element[] elements;
    foreach (found; matchAll(html, re)) {
        // Tabs become spaces and runs of spaces are squeezed
        // in the declaration part.
        auto decl = found["declaration"].replace("\t", " ");
        decl = decl.removeDuplicateConsecutiveSubstring(" ");
        elements ~= Element(decl, pairedTag ? found["content"] : "");
    }
    return elements;
}


/*******************************************************************************
 * This function returns title of Internet-page.
*/
string getHTMLPageTitle(string address) {
    string html = getPage(address).toString;
    auto res = getElementsByTag(html, "title");
    if (res.empty) return "";
    // Entities are decoded only after the element has been located:
    // decoding the whole page first could turn "&lt;" into markup
    // and confuse the tag search.
    string title = res[0].content;
    title.replaceSpecialMnemonics;
    return title;
}


/*******************************************************************************
 * Search and replace special characters in HTML for normal view.
 *
 * Named character references listed in the internal table ("&amp;", "&copy;",
 * "&alpha;", ...) are replaced with the characters they stand for.
 * The text is scanned once from left to right and replaced text is never
 * scanned again, so "&amp;lt;" becomes "&lt;" and not "<".
 * References that are not in the table are left untouched.
 *
 * Params:
 *     HTMLText = Text to process; it is modified in place.
 * Returns: Reference to the same (modified) string.
 */
ref string replaceSpecialMnemonics(return ref string HTMLText) {
    import std.algorithm.comparison : max, min;
    import std.array : appender;
    import std.string : indexOf;

    // The table is built once per thread on first use
    // instead of on every call.
    static string[string] entities;
    static size_t longestEntity;
    if (entities is null) {
        entities = [
            "&iexcl;" : "¡", "&cent;" : "¢", "&pound;" : "£", "&curren;" : "¤",
            "&yen;" : "¥", "&brvbar;" : "¦", "&sect;" : "§", "&uml;" : "¨",
            "&copy;" : "©", "&ordf;" : "ª", "&laquo;" : "«", "&raquo;" : "»",
            "&not;" : "¬", "&shy;" : "", "&reg;" : "®", "&macr;" : "¯",
            "&deg;" : "°", "&plusmn;" : "±", "&sup2;" : "²", "&sup3;" : "³",
            "&acute;" : "´", "&micro;" : "µ", "&para;" : "¶", "&middot;" : "·",
            "&cedil;" : "¸", "&sup1;" : "¹", "&ordm;" : "º", "&frac14;" : "¼",
            "&frac12;" : "½", "&frac34;" : "¾", "&iquest;" : "¿",

            "&Agrave;" : "À", "&Aacute;" : "Á", "&Acirc;" : "Â",
            "&Atilde;" : "Ã", "&Auml;" : "Ä", "&Aring;" : "Å",
            "&AElig;" : "Æ", "&Ccedil;" : "Ç", "&Egrave;" : "È",
            "&Eacute;" : "É", "&Ecirc;" : "Ê", "&Euml;" : "Ë",
            "&Igrave;" : "Ì", "&Iacute;" : "Í", "&Icirc;" : "Î",
            "&Iuml;" : "Ï", "&ETH;" : "Ð", "&Ntilde;" : "Ñ",
            "&Ograve;" : "Ò", "&Oacute;" : "Ó", "&Ocirc;" : "Ô",
            "&Otilde;" : "Õ", "&Ouml;" : "Ö", "&times;" : "×",
            "&Oslash;" : "Ø", "&Ugrave;" : "Ù", "&Uacute;" : "Ú",
            "&Ucirc;" : "Û", "&Uuml;" : "Ü", "&Yacute;" : "Ý",
            "&THORN;" : "Þ", "&szlig;" : "ß",

            "&agrave;" : "à", "&aacute;" : "á", "&acirc;" : "â",
            "&atilde;" : "ã", "&auml;" : "ä", "&aring;" : "å",
            "&aelig;" : "æ", "&ccedil;" : "ç", "&egrave;" : "è",
            "&eacute;" : "é", "&ecirc;" : "ê", "&euml;" : "ë",
            "&igrave;" : "ì", "&iacute;" : "í", "&icirc;" : "î",
            "&iuml;" : "ï", "&eth;" : "ð", "&ntilde;" : "ñ",
            "&ograve;" : "ò", "&oacute;" : "ó", "&ocirc;" : "ô",
            "&otilde;" : "õ", "&ouml;" : "ö", "&divide;" : "÷",
            "&oslash;" : "ø", "&ugrave;" : "ù", "&uacute;" : "ú",
            "&ucirc;" : "û", "&uuml;" : "ü", "&yacute;" : "ý",
            "&thorn;" : "þ", "&yuml;" : "ÿ",

            "&fnof;" : "ƒ",

            "&Alpha;" : "Α", "&Beta;" : "Β", "&Gamma;" : "Γ", "&Delta;" : "Δ",
            "&Epsilon;" : "Ε", "&Zeta;" : "Ζ", "&Eta;" : "Η", "&Theta;" : "Θ",
            "&Iota;" : "Ι", "&Kappa;" : "Κ", "&Lambda;" : "Λ", "&Mu;" : "Μ",
            "&Nu;" : "Ν", "&Xi;" : "Ξ", "&Omicron;" : "Ο", "&Pi;" : "Π",
            "&Rho;" : "Ρ", "&Sigma;" : "Σ", "&Tau;" : "Τ", "&Upsilon;" : "Υ",
            "&Phi;" : "Φ", "&Chi;" : "Χ", "&Psi;" : "Ψ", "&Omega;" : "Ω",

            "&alpha;" : "α", "&beta;" : "β", "&gamma;" : "γ", "&delta;" : "δ",
            "&epsilon;" : "ε", "&zeta;" : "ζ", "&eta;" : "η", "&theta;" : "θ",
            "&iota;" : "ι", "&kappa;" : "κ", "&lambda;" : "λ", "&mu;" : "μ",
            "&nu;" : "ν", "&xi;" : "ξ", "&omicron;" : "ο", "&pi;" : "π",
            "&rho;" : "ρ", "&sigmaf;" : "ς", "&sigma;" : "σ", "&tau;" : "τ",
            "&upsilon;" : "υ", "&phi;" : "φ", "&chi;" : "χ", "&psi;" : "ψ",
            "&omega;" : "ω",
            "&thetasym;" : "ϑ", "&upsih;" : "ϒ", "&piv;" : "ϖ",

            "&bull;" : "•", "&hellip;" : "…", "&prime;" : "′",
            "&Prime;" : "″", "&oline;" : "‾", "&frasl;" : "⁄",
            "&weierp;" : "℘", "&image;" : "ℑ", "&real;" : "ℜ",
            "&trade;" : "™", "&alefsym;" : "ℵ",

            "&larr;" : "←", "&uarr;" : "↑", "&rarr;" : "→", "&darr;" : "↓",
            "&harr;" : "↔", "&crarr;" : "↵",
            "&lArr;" : "⇐", "&uArr;" : "⇑", "&rArr;" : "⇒", "&dArr;" : "⇓",
            "&hArr;" : "⇔",

            "&forall;" : "∀", "&part;" : "∂", "&exist;" : "∃",
            "&empty;" : "∅", "&nabla;" : "∇", "&isin;" : "∈",
            "&notin;" : "∉", "&ni;" : "∋", "&prod;" : "∏", "&sum;" : "∑",
            "&minus;" : "−", "&lowast;" : "∗", "&radic;" : "√",
            "&prop;" : "∝", "&infin;" : "∞", "&ang;" : "∠", "&and;" : "∧",
            "&or;" : "∨", "&cap;" : "∩", "&cup;" : "∪", "&int;" : "∫",
            "&there4;" : "∴", "&sim;" : "∼", "&cong;" : "≅",
            "&asymp;" : "≈", "&ne;" : "≠", "&equiv;" : "≡", "&le;" : "≤",
            "&ge;" : "≥", "&sub;" : "⊂", "&sup;" : "⊃", "&nsub;" : "⊄",
            "&sube;" : "⊆", "&supe;" : "⊇", "&oplus;" : "⊕",
            "&otimes;" : "⊗", "&perp;" : "⊥", "&sdot;" : "⋅",
            "&lceil;" : "⌈", "&rceil;" : "⌉", "&lfloor;" : "⌊",
            "&rfloor;" : "⌋", "&lang;" : "〈", "&rang;" : "〉",
            "&loz;" : "◊", "&spades;" : "♠", "&clubs;" : "♣",
            "&hearts;" : "♥", "&diams;" : "♦",

            // "&#39;" is the numeric form commonly emitted instead of "&apos;".
            "&apos;" : "'", "&#39;" : "'",
            "&quot;" : "\"", "&amp;" : "&", "&lt;" : "<", "&gt;" : ">",
            "&OElig;" : "Œ", "&oelig;" : "œ", "&Scaron;" : "Š",
            "&scaron;" : "š", "&Yuml;" : "Ÿ", "&circ;" : "ˆ",
            "&tilde;" : "˜", "&ndash;" : "–", "&mdash;" : "—",
            "&lsquo;" : "‘", "&rsquo;" : "’", "&sbquo;" : "‚",
            "&ldquo;" : "“", "&rdquo;" : "”", "&bdquo;" : "„",
            "&dagger;" : "†", "&Dagger;" : "‡", "&permil;" : "‰",
            "&lsaquo;" : "‹", "&rsaquo;" : "›", "&euro;" : "€"
        ];
        foreach (name; entities.byKey) {
            longestEntity = max(longestEntity, name.length);
        }
    }

    auto decoded = appender!string();
    size_t flushed = 0;  // everything before this index is already in decoded
    size_t pos = 0;      // where the search for the next '&' starts
    for (;;) {
        const rel = HTMLText[pos .. $].indexOf('&');
        if (rel < 0) break;
        const start = pos + cast(size_t) rel;
        pos = start + 1;

        // A reference cannot be longer than the longest table key,
        // so ';' is searched only within that window.
        const windowEnd = min(HTMLText.length, start + longestEntity);
        const semi = HTMLText[start .. windowEnd].indexOf(';');
        if (semi < 0) continue;

        const stop = start + cast(size_t) semi + 1;
        if (auto replacement = HTMLText[start .. stop] in entities) {
            if (flushed == 0) decoded.reserve(HTMLText.length);
            decoded.put(HTMLText[flushed .. start]);
            decoded.put(*replacement);
            flushed = stop;
            pos = stop;  // replaced text is never scanned again
        }
    }

    // flushed stays 0 only when nothing was replaced: leave the text as is.
    if (flushed != 0) {
        decoded.put(HTMLText[flushed .. $]);
        HTMLText = decoded.data;
    }
    return HTMLText;
}


/*******************************************************************************
 * Extracts the value of the "charset" parameter from a Content-Type value,
 * e.g. "text/html; charset=UTF-8" gives "UTF-8".
 * The parameter name is matched case-insensitively and double quotes
 * around the value are removed.
 * Returns: Charset name, or an empty string if there is no such parameter.
 */
private string extractCharsetFromContentType(string contentType) {
    foreach (field; contentType.split(';')) {
        const eq = field.indexOf('=');
        if (eq < 0) continue;
        const nameEnd = cast(size_t) eq;
        if (field[0 .. nameEnd].strip.toLower != "charset") continue;

        string value = field[nameEnd + 1 .. $].strip;
        // The value may be given as a quoted string: charset="utf-8".
        if (value.length >= 2 && value[0] == '"' && value[$ - 1] == '"') {
            value = value[1 .. $ - 1];
        }
        return value.idup;
    }
    return null;
}