amalthea.net source code

1 /* This file is part of the Amalthea library.
2  *
3  * Copyright (C) 2018-2021 Eugene 'Vindex' Stulin
4  *
5  * Distributed under the Boost Software License 1.0 or (at your option)
6  * the GNU Lesser General Public License 3.0 or later.
7  */
8 
9 module amalthea.net;
10 
11 public import amalthea.libcore;
12 
13 import amalthea.dataprocessing,
14        amalthea.encoding,
15        amalthea.fs;
16 
17 import std.algorithm, std.format, std.range, std.regex, std.string;
18 import std.net.curl;
19 
// Short names for the Phobos request functions; this module declares its own
// `post`, so the standard one is reached through `stdPost`.
static alias stdGet  = std.net.curl.get;
static alias stdPost = std.net.curl.post;

// Makes the project's urlEncode reachable under this module's name.
alias urlEncode = amalthea.dataprocessing.urlEncode;

// Exception type used for network errors; the choice depends on the compiler.
version (DigitalMars) {
    alias StdNetException = std.net.curl.HTTPStatusException;
} else {
    alias StdNetException = std.net.curl.CurlException;
}
29 
30 
/*******************************************************************************
 * General-purpose exception type for failures reported by this module.
 */
class AmaltheaNetException : Exception {
    /// Hands the message and the source location over to the base `Exception`.
    this(string msg, string file = __FILE__, size_t line = __LINE__) {
        super(msg, file, line);
    }
}
39 
40 
/*******************************************************************************
 * Performs a GET request and hands back the response body undecoded.
 *
 * Params:
 *     url = Address to request.
 *
 * Returns: The response body as a byte array.
 *
 * Throws: HTTPStatusException when the status code is outside the 2xx range.
 */
ubyte[] getRaw(string url) {
    import std.array : appender;

    auto received = appender!(ubyte[])();
    HTTP.StatusLine status;

    auto http = HTTP();
    // NOTE(review): peer certificate verification is switched off here.
    http.verifyPeer = false;
    http.method = HTTP.Method.get;
    http.url = url;
    http.onReceiveStatusLine = (HTTP.StatusLine line) { status = line; };
    http.onReceive = (ubyte[] chunk) {
        received ~= chunk;
        return chunk.length;
    };
    http.perform();

    immutable succeeded = status.code / 100 == 2;
    if (succeeded) {
        return received.data;
    }
    throw new HTTPStatusException(
        status.code,
        format("HTTP request returned status code %d (%s)",
               status.code, status.reason)
    );
}
68 
69 
/*******************************************************************************
 * Sends a POST request whose body is built from a dictionary.
 *
 * The dictionary is passed through `urlEncode` and the result is sent with
 * the standard library's `post`.
 *
 * Params:
 *     T        = Element type of the returned array: `char` or `ubyte`.
 *     url      = Address to send the request to.
 *     postDict = Form fields as name/value pairs.
 *     conn     = HTTP connection object to use for the request.
 *
 * Returns: The response as an array of `T`.
 */
T[] post(T = char)(const(char)[] url,
                   string[string] postDict,
                   HTTP conn = HTTP())
if (is(T == char) || is(T == ubyte)) {
    auto encodedForm = urlEncode(postDict);
    return stdPost!T(url, encodedForm, conn);
}
79 
80 
/*******************************************************************************
 * Sends a POST request and returns the answer as raw bytes.
 *
 * Params:
 *     url      = Address to send the request to.
 *     postDict = Form fields as name/value pairs.
 *     conn     = HTTP connection object to use for the request.
 *
 * Returns: Byte array with the response.
 */
ubyte[] postRaw(const(char)[] url,
                string[string] postDict,
                HTTP conn = HTTP()) {
    // Byte-returning instantiation of this module's own `post`.
    alias postBytes = amalthea.net.post!ubyte;
    return postBytes(url, postDict, conn);
}
90 
91 
/*******************************************************************************
 * Gets HTTP response headers by URL.
 *
 * Params:
 *     url = Address to request.
 *
 * Returns: Response headers as reported by `HTTP.responseHeaders`.
 */
string[string] getHeaders(in char[] url) {
    auto http = HTTP(url);
    // Without a receive handler libcurl falls back to its default write
    // callback, which prints the response body to stdout. Only the headers
    // are wanted here, so the body is consumed and thrown away.
    http.onReceive = (ubyte[] data) { return data.length; };
    http.perform();
    return http.responseHeaders;
}
100 
101 
/*******************************************************************************
 * Looks up the Content-Type response header of a URL.
 *
 * Header names are compared case-insensitively.
 *
 * Params:
 *     url = Address to request.
 *
 * Returns: The header value, or a null string when the header is absent.
 */
string getContentType(in char[] url) {
    foreach (entry; getHeaders(url).byKeyValue) {
        if (entry.key.toLower == "content-type") {
            return entry.value;
        }
    }
    return null;
}
116 
117 
/*******************************************************************************
 * Determines the charset announced in the Content-Type header of a URL.
 *
 * Params:
 *     url = Address to request.
 *
 * Returns: Charset name; may be empty.
 */
string getCharset(const(char)[] url) {
    return extractCharsetFromContentType(getContentType(url));
}
125 
126 
/*******************************************************************************
 * Tells whether a URL leads to an HTML page.
 *
 * The decision is based on the Content-Type header containing "text/html".
 * Any exception raised while fetching the header is treated as "no".
 *
 * Params:
 *     url = Address to check.
 *
 * Returns: `true` if the content type mentions "text/html", `false` otherwise.
 */
bool isLinkToHTML(string url) {
    try {
        return getContentType(url).canFind("text/html");
    } catch (Exception) {
        return false;
    }
}
141 
142 
/*******************************************************************************
 * Downloads a page and wraps it into amalthea.encoding.UniString.
 * Modelled on the implementation of 'get' from the standard library.
 *
 * The charset is taken from the Content-Type response header; "UTF-8" is used
 * when the header does not provide one.
 *
 * Params:
 *     url = Address to request.
 *
 * Returns: UniString built from the charset name and the received bytes.
 *
 * Throws: HTTPStatusException when the status code is outside the 2xx range.
 */
UniString getPage(string url) {
    import std.array : appender;
    import std.conv : to;

    auto received = appender!(ubyte[])();
    HTTP.StatusLine status;
    string contentType;
    string charset;

    auto http = HTTP();
    // NOTE(review): peer certificate verification is switched off here.
    http.verifyPeer = false;
    http.method = HTTP.Method.get;
    http.url = url;
    http.onReceiveStatusLine = (HTTP.StatusLine line) { status = line; };
    http.onReceive = (ubyte[] chunk) {
        received ~= chunk;
        return chunk.length;
    };
    http.onReceiveHeader = (in char[] key, in char[] value) {
        switch (key.idup.toLower) {
            case "content-length":
                // Pre-allocate room for the announced body size.
                received.reserve(value.to!size_t);
                break;
            case "content-type":
                contentType = value.idup;
                charset = extractCharsetFromContentType(contentType);
                break;
            default:
                break;
        }
    };
    http.perform();

    if (status.code / 100 != 2) {
        throw new HTTPStatusException(
            status.code,
            format("HTTP request returned status code %d (%s)",
                   status.code, status.reason)
        );
    }
    return UniString(charset.empty ? "UTF-8" : charset, received.data);
}
188 
189 
190 
/*******************************************************************************
 * Collects every element of an HTML text that has the given tag.
 *
 * Params:
 *     html = HTML text to search in.
 *     tag  = Tag name to look for.
 *
 * Returns: Whatever getElementsByTagAndAttribute returns when no attribute
 *          constraint is given.
 */
auto getElementsByTag(string html, string tag) {
    return html.getElementsByTagAndAttribute(tag);
}
197 
198 
/*******************************************************************************
 * Collects elements of an HTML text by tag and, optionally, by attribute.
 *
 * The text is first rearranged so that every opening tag of interest starts
 * a line and every closing tag ends one; the elements are then picked up with
 * a multi-line, case-insensitive regular expression. When the paired form
 * (opening tag, content, closing tag) matches nothing, a second expression
 * for an opening tag alone is used and the content is left empty.
 *
 * Params:
 *     html      = HTML text to search in.
 *     tag       = Tag name; inserted into the regular expression as is.
 *     attrName  = Attribute name the element must contain (optional).
 *     attrValue = Attribute value the element must contain (optional).
 *
 * Returns: Array of tuples with two fields: `declaration` (tag name with its
 *          attributes) and `content` (text between the tags).
 */
auto getElementsByTagAndAttribute(string html,
                                  string tag,
                                  string attrName = "",
                                  string attrValue = "") {
    alias Element = Tuple!(string, "declaration", string, "content");
    enum pairedFmt =
        `^<(?P<declaration>%s[^>]*%s%s *[^>]*)>(?P<content>.*)</%s *>$`;
    enum unpairedFmt = `^<(?P<declaration>%s [^>]*%s%s *[^>]*)>`;

    string lowerTag = tag.toLower;
    string upperTag = tag.toUpper;

    // Collapse the text into one line, then put line breaks around the tags:
    // first for the upper-case spelling (lower-casing it on the way), then
    // for the lower-case spelling.
    html = html.replace("\n", " ");
    foreach (spelling; [upperTag, lowerTag]) {
        html = html.replace("<" ~ spelling ~ " ", "\n<" ~ lowerTag ~ " ");
        html = html.replace("<" ~ spelling ~ ">", "\n<" ~ lowerTag ~ ">");
        html = html.replace("</" ~ spelling ~ " ", "</" ~ lowerTag ~ " \n");
        html = html.replace("</" ~ spelling ~ ">", "</" ~ lowerTag ~ ">\n");
    }

    // The value may be quoted or not, with optional spaces around '='.
    if (!attrValue.empty) {
        attrValue = ` *= *"?` ~ attrValue ~ `"?`;
    }
    string openingTag = attrName.empty ? tag : tag ~ " ";

    auto pattern = regex(
        format!pairedFmt(openingTag, attrName, attrValue, tag), "im"
    );
    immutable pairedTag = !matchAll(html, pattern).empty;
    if (!pairedTag) {
        pattern = regex(format!unpairedFmt(tag, attrName, attrValue), "im");
    }

    Element[] elements;
    foreach (found; matchAll(html, pattern)) {
        // Normalize whitespace inside the declaration.
        auto decl = found["declaration"].replace("\t", " ");
        decl = decl.removeDuplicateConsecutiveSubstring(" ");
        string content;
        if (pairedTag) {
            content = found["content"];
        } else {
            content = "";
        }
        elements ~= Element(decl, content);
    }
    return elements;
}
243 
244 
/*******************************************************************************
 * This function returns the title of an Internet page.
 *
 * Params:
 *     address = URL of the page.
 *
 * Returns: Content of the first `title` element with HTML mnemonics replaced,
 *          or an empty string if the page has no such element.
 */
string getHTMLPageTitle(string address) {
    string html = getPage(address).toString;
    auto titles = getElementsByTag(html, "title");
    if (titles.empty) return "";
    string title = titles[0].content;
    // Mnemonics are replaced only after the element has been extracted:
    // decoding the whole page first would turn escaped markup such as
    // "&lt;title&gt;" into real tags and confuse the search above.
    replaceSpecialMnemonics(title);
    return title;
}
255 
256 
/*******************************************************************************
 * Search and replace special characters in HTML for normal view.
 *
 * Named mnemonics from the table below (for example "&amp;copy;" written as
 * `&copy;`) are replaced with the characters they stand for. The text is
 * scanned once from left to right and replaced text is never scanned again,
 * so `&amp;lt;` becomes `&lt;` and not `<`. Unknown mnemonics and lone
 * ampersands are left untouched.
 *
 * Params:
 *     HTMLText = Text to process; it is modified in place.
 *
 * Returns: Reference to the processed `HTMLText`.
 */
ref string replaceSpecialMnemonics(return ref string HTMLText) {
    import std.array : appender;
    import std.ascii : isAlphaNum;
    import std.string : indexOf;

    // Built on first use (once per thread) instead of on every call.
    static string[string] specialHTMLSymbols;
    if (specialHTMLSymbols is null) {
        specialHTMLSymbols = [
            "&iexcl;"    : "¡",
            "&cent;"     : "¢",
            "&pound;"    : "£",
            "&curren;"   : "¤",
            "&yen;"      : "¥",
            "&brvbar;"   : "¦",
            "&sect;"     : "§",
            "&uml;"      : "¨",
            "&copy;"     : "©",
            "&ordf;"     : "ª",
            "&laquo;"    : "«",
            "&raquo;"    : "»",
            "&not;"      : "¬",
            "&shy;"      : "",
            "&reg;"      : "®",
            "&macr;"     : "¯",
            "&deg;"      : "°",
            "&plusmn;"   : "±",
            "&sup2;"     : "²",
            "&sup3;"     : "³",
            "&acute;"    : "´",
            "&micro;"    : "µ",
            "&para;"     : "¶",
            "&middot;"   : "·",
            "&cedil;"    : "¸",
            "&sup1;"     : "¹",
            "&ordm;"     : "º",
            "&frac14;"   : "¼",
            "&frac12;"   : "½",
            "&frac34;"   : "¾",
            "&iquest;"   : "¿",
            "&Agrave;"   : "À",
            "&Aacute;"   : "Á",
            "&Acirc;"    : "Â",
            "&Atilde;"   : "Ã",
            "&Auml;"     : "Ä",
            "&Aring;"    : "Å",
            "&AElig;"    : "Æ",
            "&Ccedil;"   : "Ç",
            "&Egrave;"   : "È",
            "&Eacute;"   : "É",
            "&Ecirc;"    : "Ê",
            "&Euml;"     : "Ë",
            "&Igrave;"   : "Ì",
            "&Iacute;"   : "Í",
            "&Icirc;"    : "Î",
            "&Iuml;"     : "Ï",
            "&ETH;"      : "Ð",
            "&Ntilde;"   : "Ñ",
            "&Ograve;"   : "Ò",
            "&Oacute;"   : "Ó",
            "&Ocirc;"    : "Ô",
            "&Otilde;"   : "Õ",
            "&Ouml;"     : "Ö",
            "&times;"    : "×",
            "&Oslash;"   : "Ø",
            "&Ugrave;"   : "Ù",
            "&Uacute;"   : "Ú",
            "&Ucirc;"    : "Û",
            "&Uuml;"     : "Ü",
            "&Yacute;"   : "Ý",
            "&THORN;"    : "Þ",
            "&szlig;"    : "ß",
            "&agrave;"   : "à",
            "&aacute;"   : "á",
            "&acirc;"    : "â",
            "&atilde;"   : "ã",
            "&auml;"     : "ä",
            "&aring;"    : "å",
            "&aelig;"    : "æ",
            "&ccedil;"   : "ç",
            "&egrave;"   : "è",
            "&eacute;"   : "é",
            "&ecirc;"    : "ê",
            "&euml;"     : "ë",
            "&igrave;"   : "ì",
            "&iacute;"   : "í",
            "&icirc;"    : "î",
            "&iuml;"     : "ï",
            "&eth;"      : "ð",
            "&ntilde;"   : "ñ",
            "&ograve;"   : "ò",
            "&oacute;"   : "ó",
            "&ocirc;"    : "ô",
            "&otilde;"   : "õ",
            "&ouml;"     : "ö",
            "&divide;"   : "÷",
            "&oslash;"   : "ø",
            "&ugrave;"   : "ù",
            "&uacute;"   : "ú",
            "&ucirc;"    : "û",
            "&uuml;"     : "ü",
            "&yacute;"   : "ý",
            "&thorn;"    : "þ",
            "&yuml;"     : "ÿ",
            "&fnof;"     : "ƒ",
            "&Alpha;"    : "Α",
            "&Beta;"     : "Β",
            "&Gamma;"    : "Γ",
            "&Delta;"    : "Δ",
            "&Epsilon;"  : "Ε",
            "&Zeta;"     : "Ζ",
            "&Eta;"      : "Η",
            "&Theta;"    : "Θ",
            "&Iota;"     : "Ι",
            "&Kappa;"    : "Κ",
            "&Lambda;"   : "Λ",
            "&Mu;"       : "Μ",
            "&Nu;"       : "Ν",
            "&Xi;"       : "Ξ",
            "&Omicron;"  : "Ο",
            "&Pi;"       : "Π",
            "&Rho;"      : "Ρ",
            "&Sigma;"    : "Σ",
            "&Tau;"      : "Τ",
            "&Upsilon;"  : "Υ",
            "&Phi;"      : "Φ",
            "&Chi;"      : "Χ",
            "&Psi;"      : "Ψ",
            "&Omega;"    : "Ω",
            "&alpha;"    : "α",
            "&beta;"     : "β",
            "&gamma;"    : "γ",
            "&delta;"    : "δ",
            "&epsilon;"  : "ε",
            "&zeta;"     : "ζ",
            "&eta;"      : "η",
            "&theta;"    : "θ",
            "&iota;"     : "ι",
            "&kappa;"    : "κ",
            "&lambda;"   : "λ",
            "&mu;"       : "μ",
            "&nu;"       : "ν",
            "&xi;"       : "ξ",
            "&omicron;"  : "ο",
            "&pi;"       : "π",
            "&rho;"      : "ρ",
            "&sigmaf;"   : "ς",
            "&sigma;"    : "σ",
            "&tau;"      : "τ",
            "&upsilon;"  : "υ",
            "&phi;"      : "φ",
            "&chi;"      : "χ",
            "&psi;"      : "ψ",
            "&omega;"    : "ω",
            "&thetasym;" : "ϑ",
            "&upsih;"    : "ϒ",
            "&piv;"      : "ϖ",
            "&bull;"     : "•",
            "&hellip;"   : "…",
            "&prime;"    : "′",
            "&Prime;"    : "″",
            "&oline;"    : "‾",
            "&frasl;"    : "⁄",
            "&weierp;"   : "℘",
            "&image;"    : "ℑ",
            "&real;"     : "ℜ",
            "&trade;"    : "™",
            "&alefsym;"  : "ℵ",
            "&larr;"     : "←",
            "&uarr;"     : "↑",
            "&rarr;"     : "→",
            "&darr;"     : "↓",
            "&harr;"     : "↔",
            "&crarr;"    : "↵",
            "&lArr;"     : "⇐",
            "&uArr;"     : "⇑",
            "&rArr;"     : "⇒",
            "&dArr;"     : "⇓",
            "&hArr;"     : "⇔",
            "&forall;"   : "∀",
            "&part;"     : "∂",
            "&exist;"    : "∃",
            "&empty;"    : "∅",
            "&nabla;"    : "∇",
            "&isin;"     : "∈",
            "&notin;"    : "∉",
            "&ni;"       : "∋",
            "&prod;"     : "∏",
            "&sum;"      : "∑",
            "&minus;"    : "−",
            "&lowast;"   : "∗",
            "&radic;"    : "√",
            "&prop;"     : "∝",
            "&infin;"    : "∞",
            "&ang;"      : "∠",
            "&and;"      : "∧",
            "&or;"       : "∨",
            "&cap;"      : "∩",
            "&cup;"      : "∪",
            "&int;"      : "∫",
            "&there4;"   : "∴",
            "&sim;"      : "∼",
            "&cong;"     : "≅",
            "&asymp;"    : "≈",
            "&ne;"       : "≠",
            "&equiv;"    : "≡",
            "&le;"       : "≤",
            "&ge;"       : "≥",
            "&sub;"      : "⊂",
            "&sup;"      : "⊃",
            "&nsub;"     : "⊄",
            "&sube;"     : "⊆",
            "&supe;"     : "⊇",
            "&oplus;"    : "⊕",
            "&otimes;"   : "⊗",
            "&perp;"     : "⊥",
            "&sdot;"     : "⋅",
            "&lceil;"    : "⌈",
            "&rceil;"    : "⌉",
            "&lfloor;"   : "⌊",
            "&rfloor;"   : "⌋",
            "&lang;"     : "〈",
            "&rang;"     : "〉",
            "&loz;"      : "◊",
            "&spades;"   : "♠",
            "&clubs;"    : "♣",
            "&hearts;"   : "♥",
            "&diams;"    : "♦",

            "&apos;"     : "'",
            "&quot;"     : "\"",
            "&amp;"      : "&",
            "&lt;"       : "<",
            "&gt;"       : ">",
            "&OElig;"    : "Œ",
            "&oelig;"    : "œ",
            "&Scaron;"   : "Š",
            "&scaron;"   : "š",
            "&Yuml;"     : "Ÿ",
            "&circ;"     : "ˆ",
            "&tilde;"    : "˜",
            "&ndash;"    : "–",
            "&mdash;"    : "—",
            "&lsquo;"    : "‘",
            "&rsquo;"    : "’",
            "&sbquo;"    : "‚",
            "&ldquo;"    : "“",
            "&rdquo;"    : "”",
            "&bdquo;"    : "„",
            "&dagger;"   : "†",
            "&Dagger;"   : "‡",
            "&permil;"   : "‰",
            "&lsaquo;"   : "‹",
            "&rsaquo;"   : "›",
            "&euro;"     : "€"
        ];
    }

    // Nothing to do for text without ampersands (this includes empty text).
    if (HTMLText.indexOf('&') < 0) {
        return HTMLText;
    }

    auto decoded = appender!string();
    decoded.reserve(HTMLText.length);
    size_t copiedUpTo = 0;  // start of the part not yet written to `decoded`
    size_t pos = 0;
    while (pos < HTMLText.length) {
        if (HTMLText[pos] != '&') {
            ++pos;
            continue;
        }
        // A candidate is '&', then ASCII letters/digits, then ';'.
        size_t nameEnd = pos + 1;
        while (nameEnd < HTMLText.length && isAlphaNum(HTMLText[nameEnd])) {
            ++nameEnd;
        }
        immutable terminated = nameEnd > pos + 1
                            && nameEnd < HTMLText.length
                            && HTMLText[nameEnd] == ';';
        if (terminated) {
            if (auto symbol = HTMLText[pos .. nameEnd + 1] in specialHTMLSymbols) {
                decoded.put(HTMLText[copiedUpTo .. pos]);
                decoded.put(*symbol);
                pos = nameEnd + 1;
                copiedUpTo = pos;
                continue;
            }
        }
        // Not a known mnemonic: keep it as is. The skipped characters are
        // letters or digits, so no ampersand can be missed.
        pos = nameEnd;
    }
    decoded.put(HTMLText[copiedUpTo .. $]);
    HTMLText = decoded.data;
    return HTMLText;
}
515 
516 
/*******************************************************************************
 * Extracts the value of the `charset` parameter from a Content-Type value.
 *
 * The parameter name is matched case-insensitively and a value enclosed in
 * double quotes is returned without the quotes, so `text/html; charset=utf-8`
 * and `text/html; Charset="utf-8"` both give `utf-8`.
 *
 * Params:
 *     contentType = Content-Type header value; may be empty.
 *
 * Returns: The charset name, or a null string if there is no such parameter.
 */
private string extractCharsetFromContentType(string contentType) {
    foreach (param; contentType.split(';')) {
        auto eq = param.indexOf('=');
        if (eq < 0) {
            continue;  // media type itself or a parameter without a value
        }
        if (param[0 .. eq].strip.toLower != "charset") {
            continue;
        }
        string value = param[eq + 1 .. $].strip;
        if (value.length >= 2 && value[0] == '"' && value[$ - 1] == '"') {
            value = value[1 .. $ - 1];
        }
        return value;
    }
    return null;
}
531