1 /**
2 * Parses the URL-host (not the url)
3 * e.g. www.github.com , lb1.www.some-cool-domain.co.uk , 127.0.0.1 , 2001:0db8:0:0:0:0:1428:57ab
4 *
5 * Copyright:
6 * (C) 2016 Martin Brzenska
7 *
8 * License:
9 * Distributed under the terms of the MIT license.
10 * Consult the provided LICENSE.md file for details
11 */
12 module libhosttokens.host;
13 
14 
/**
* A parsed hostname.
*
* Holds the original string plus its components: the subdomain labels, the
* registrable label (`lowlevelDomain`) and the registry labels (TLD, or
* ccSLD and TLD). For IP addresses only `host`, `lowlevelDomain` and `isIP`
* carry information.
*
* All accessors are `const`, so they work on mutable, `const` and
* `immutable` instances alike.
*/
struct Host
{
  ///The original hostname.
  string host;
  ///A list of subdomains.
  string[] subdomains;
  ///The part of the domain between the subdomain and tld/ccSLD.
  string lowlevelDomain;
  ///A list of TLD or ccSLD and TLD.
  string[] reglevels = [];
  ///True if the hostname is an IP (IPv4 or IPv6).
  bool isIP;


  ///Returns: the original, unmodified hostname.
  string toString() const {
    return this.host;
  }

  ///The TLD or ccSLD.TLD, i.e. all `reglevels` joined by dots.
  @property tld() const {
    import std.array : join;
    return this.reglevels.join(".");
  }

  /**
  * The registrable domain: `lowlevelDomain` followed by the `tld`.
  *
  * For IP addresses no dot is inserted, the address stored in
  * `lowlevelDomain` is followed directly by the `tld`.
  */
  @property paylevelDomain() const {
    string prefix = this.lowlevelDomain;
    //Only separate with a dot when there is a label to separate and it is not an IP.
    if(prefix.length && !this.isIP) {
      prefix ~= ".";
    }
    return prefix ~ this.tld;
  }

  ///Everything left of the paylevelDomain, i.e. all `subdomains` joined by dots.
  @property subdomain() const {
    import std.array : join;
    return this.subdomains.join(".");
  }

}
54 
/**
* Parses a hostname.
*
* The host is first tried as an IP literal (IPv4 or IPv6). If that fails it is
* split at the dots and read from right to left: the rightmost label is the
* TLD; if that TLD has an entry in `ccSLDs` and the next label is listed
* there, both labels form the registry part (e.g. "co.uk"). The label left of
* the registry part becomes `lowlevelDomain`, everything further left the
* subdomains.
*
* Hosts with too few labels (e.g. "localhost" or an empty string) do not fail;
* the missing components are simply left empty.
*
* Params:
*   host = the hostname to be parsed
*
* Returns: A Host struct containing the hostname elements (subdomain , paylevelDomain , tld ...).
*/
immutable(Host) parseHost(string host) {
  import std.array : split;
  import std.algorithm.searching : canFind;
  import std.algorithm.mutation : reverse;
  import std.socket : parseAddress , Address , SocketException;

  import libhosttokens.ccSLD : ccSLDs;

  //IP literals: parseAddress throws for everything that is not a numeric address.
  bool isIPaddr = true;
  Address addr;
  try
  {
    addr = parseAddress(host);
  }
  catch(SocketException e) {
    isIPaddr = false;
  }

  if(isIPaddr) {
    //The address string as rendered by the Address object is stored, `host` keeps the original spelling.
    return immutable(Host)(
      host,
      [],
      addr.toAddrString(),
      [],
      true
    );
  }

  //Labels ordered from right (TLD) to left (deepest subdomain).
  auto arrHost = split(host , ".");
  if(arrHost.length == 0) {
    //Nothing to tokenize (empty host).
    return immutable(Host)(
      host,
      [],
      "",
      [],
      false
    );
  }
  arrHost.reverse();

  //Parse TLD/ccSLD: the registry part is one label, or two if the second one is a known ccSLD of the TLD.
  size_t regCount = 1;
  if( arrHost.length > 1 && (arrHost[0] in ccSLDs) && ccSLDs[arrHost[0]].canFind(arrHost[1]) ) {
    regCount = 2;
  }
  string[] sHost_reglevels = arrHost[0 .. regCount].dup;
  sHost_reglevels.reverse();

  //Paydomain: the label directly left of the registry part, if there is one.
  string sHost_lowlevelDomain = arrHost.length > regCount ? arrHost[regCount] : "";

  //Subdomains: all remaining labels, restored to their original left-to-right order.
  string[] sHost_subdomains;
  if(arrHost.length > regCount + 1) {
    sHost_subdomains = arrHost[regCount + 1 .. $].dup;
    sHost_subdomains.reverse();
  }

  return immutable(Host)(
    host,
    sHost_subdomains.idup,
    sHost_lowlevelDomain,
    sHost_reglevels.idup,
    false
  );
}
137 
//parseHost returns immutable(Host), which cannot be reassigned;
//every case therefore binds its own variable inside its own scope.
unittest {
  import std.format : format;

  //Plain TLD with a single subdomain.
  {
    immutable host = parseHost("profil.mab-on.net");
    assert(host.lowlevelDomain == "mab-on");
    assert(host.tld == "net");
    assert(host.subdomain == "profil");
    assert(host.paylevelDomain == "mab-on.net");
  }

  //ccSLD + TLD.
  {
    immutable host = parseHost("www.amazon.co.uk");
    assert(host.lowlevelDomain == "amazon");
    assert(host.tld == "co.uk");
    assert(host.subdomain == "www");
    assert(host.paylevelDomain == "amazon.co.uk");
  }

  {
    immutable host = parseHost("www.herts.police.uk");
    assert(host.lowlevelDomain == "herts");
    assert(host.tld == "police.uk");
    assert(host.subdomain == "www");
    assert(host.paylevelDomain == "herts.police.uk");
  }

  //Several subdomain labels.
  {
    immutable host = parseHost("www.ub.uni-koeln.de");
    assert(host.lowlevelDomain == "uni-koeln");
    assert(host.tld == "de");
    assert(host.subdomain == "www.ub");
    assert(host.paylevelDomain == "uni-koeln.de");
  }

  //IPv4
  {
    immutable host = parseHost("127.0.0.1");
    assert(host.lowlevelDomain == "127.0.0.1" , host.lowlevelDomain);
    assert(host.tld == "");
    assert(host.subdomain == "");
    assert(host.paylevelDomain == "127.0.0.1");
  }

  //IPv6
  {
    immutable host = parseHost("2001:0db8:85a3:08d3:1319:8a2e:0370:7344");
    assert(host.lowlevelDomain == "2001:db8:85a3:8d3:1319:8a2e:370:7344" , host.lowlevelDomain);
    //Note, that Host.host is the original IPv6 String - other properties shorten the address.
    assert(format("%s",host) == "2001:0db8:85a3:08d3:1319:8a2e:0370:7344");
    assert(host.tld == "");
    assert(host.subdomain == "");
    assert(host.paylevelDomain == "2001:db8:85a3:8d3:1319:8a2e:370:7344");
  }
}