EN
Java - normalize URL to canonical form
4 points
In this short article, we would like to show how to normalize a URL in Java to a common canonical form.
Normalization form:
xxxxxxxxxx
1
[INPUT] [OUTPUT]
2
3
http://test.com:80 http://test.com
4
https://test.com:80 https://test.com:80
5
http://test.com:443 http://test.com:443
6
https://test.com:443 https://test.com
7
http://test.com/ http://test.com
8
http://test.com? http://test.com
9
http://test.com?a=1&&b=2& http://test.com?a=1&b=2
10
http://test.com# http://test.com
11
http://test.com?b=2&a=1 http://test.com?a=1&b=2
12
http://test.com:80/?# http://test.com
13
https://test.com:443/?# https://test.com
14
https://test.com/users/john/?p=10 https://test.com/users/john?p=10
15
https://test.com/users/john/?p=10# https://test.com/users/john?p=10
16
https://test.com/users/john/?p=10#data https://test.com/users/john?p=10#data
17
https://test.com/users/john/?#data https://test.com/users/john#data
18
https://test.com:8080/users/john/?#data https://test.com:8080/users/john#data
19
https://test.com:8080//users//john//?#data https://test.com:8080/users/john#data
20
https://test.com:443//?# https://test.com
Program.java
file:
xxxxxxxxxx
1
package example.utils;
2
3
import java.net.URISyntaxException;
4
5
public final class UrlUtils {
6
7
public static void main(String[] args) throws URISyntaxException {
8
9
printNormalized("http://test.com:80"); // http://test.com
10
printNormalized("https://test.com:80"); // https://test.com:80
11
printNormalized("http://test.com:443"); // http://test.com:443
12
printNormalized("https://test.com:443"); // https://test.com
13
printNormalized("http://test.com/"); // http://test.com
14
printNormalized("http://test.com?"); // http://test.com
15
printNormalized("http://test.com?a=1&&b=2&"); // http://test.com?a=1&b=2
16
printNormalized("http://test.com#"); // http://test.com
17
printNormalized("http://test.com?b=2&a=1"); // http://test.com?a=1&b=2
18
printNormalized("http://test.com:80/?#"); // http://test.com
19
printNormalized("https://test.com:443/?#"); // https://test.com
20
printNormalized("https://test.com/users/john/?p=10"); // https://test.com/users/john?p=10
21
printNormalized("https://test.com/users/john/?p=10#"); // https://test.com/users/john?p=10
22
printNormalized("https://test.com/users/john/?p=10#data"); // https://test.com/users/john?p=10#data
23
printNormalized("https://test.com/users/john/?#data"); // https://test.com/users/john#data
24
printNormalized("https://test.com:8080/users/john/?#data"); // https://test.com:8080/users/john#data
25
printNormalized("https://test.com:8080//users//john//?#data"); // https://test.com:8080/users/john#data
26
printNormalized("https://test.com:443//?#"); // https://test.com
27
}
28
29
public static void printNormalized(String url) throws URISyntaxException {
30
System.out.println(UrlUtils.normalizeUrl(url));
31
}
32
}
UrlUtils.java
file:
xxxxxxxxxx
1
package example.utils;
2
3
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Locale;
6
7
/**
 * Static helpers for bringing URLs into a common canonical form.
 */
public final class UrlUtils {

    private UrlUtils() {
        // static utility holder, never instantiated
    }

    /**
     * Normalizes a URL to a canonical form.
     *
     * <p>The following rules are applied:
     * <ul>
     *   <li>scheme and host are lower-cased;</li>
     *   <li>the default port is dropped (80 for http, 443 for https);</li>
     *   <li>repeated and trailing slashes are removed from the path;</li>
     *   <li>empty query parameters are removed and the remaining ones are
     *       sorted lexicographically;</li>
     *   <li>empty query and fragment components are dropped.</li>
     * </ul>
     *
     * <p>All components are processed in their raw (still percent-encoded)
     * form, so an escaped delimiter such as {@code %26} or {@code %2F} is never
     * mistaken for a real {@code &} or {@code /}.
     *
     * @param url the URL to normalize; must contain a scheme
     * @return the normalized URL
     * @throws URISyntaxException if {@code url}, or the text assembled from its
     *         normalized components, is not a valid URI
     * @throws IllegalArgumentException if {@code url} has no scheme
     */
    public static String normalizeUrl(String url) throws URISyntaxException {
        URI uri = new URI(url);

        String scheme = uri.getScheme();
        if (scheme == null) {
            throw new IllegalArgumentException("URL scheme is required.");
        }
        // Scheme and host are case-insensitive; lower-casing the scheme also
        // lets the default-port check match "HTTP" / "HTTPS".
        scheme = scheme.toLowerCase(Locale.ROOT);

        String host = uri.getHost();
        if (host != null) {
            host = host.toLowerCase(Locale.ROOT);
        }

        String user = uri.getRawUserInfo();
        int port = normalizePort(scheme, uri.getPort());
        String path = normalizePath(uri.getRawPath());
        String query = normalizeQuery(uri.getRawQuery());
        String fragment = normalizeFragment(uri.getRawFragment());

        // The text is assembled by hand because the multi-argument URI
        // constructors always quote '%', which would double-encode raw parts.
        StringBuilder builder = new StringBuilder();
        builder.append(scheme).append(':');
        if (host != null) {
            builder.append("//");
            if (user != null) {
                builder.append(user).append('@');
            }
            builder.append(host);
            if (port != -1) {
                builder.append(':').append(port);
            }
        } else if (uri.getRawAuthority() != null) {
            // Authority that could not be split into user/host/port:
            // keep it verbatim instead of silently dropping it.
            builder.append("//").append(uri.getRawAuthority());
        }
        if (path != null) {
            builder.append(path);
        }
        if (query != null) {
            builder.append('?').append(query);
        }
        if (fragment != null) {
            builder.append('#').append(fragment);
        }

        // Re-parse so that an invalid result is reported, not returned.
        return new URI(builder.toString()).toString();
    }

    /**
     * Returns {@code -1} (meaning "no explicit port") when {@code port} is the
     * default for the lower-case {@code scheme}, otherwise {@code port} itself.
     */
    private static int normalizePort(String scheme, int port) {
        boolean defaultHttp = port == 80 && "http".equals(scheme);
        boolean defaultHttps = port == 443 && "https".equals(scheme);
        if (defaultHttp || defaultHttps) {
            return -1;
        }
        return port;
    }

    /**
     * Collapses runs of {@code '/'} and strips a single trailing slash.
     *
     * @return the cleaned path, or {@code null} when nothing is left
     */
    private static String normalizePath(String path) {
        String result = removeDuplicates(path, '/');
        if (result == null || result.isEmpty()) {
            return null;
        }
        int last = result.length() - 1;
        if (result.charAt(last) == '/') {
            result = result.substring(0, last);
        }
        return result.isEmpty() ? null : result;
    }

    /**
     * Drops empty parameters and sorts the remaining ones lexicographically.
     *
     * @return the cleaned query, or {@code null} when no parameter is left
     */
    private static String normalizeQuery(String query) {
        if (query == null || query.isEmpty()) {
            return null;
        }
        // split() discards trailing empty strings, so "a=1&" yields one part
        // and "&" yields none; every input must therefore be rebuilt, not
        // only those with more than one part.
        String[] parts = query.split("&");
        Arrays.sort(parts);
        StringBuilder builder = new StringBuilder();
        for (String part : parts) {
            if (part.isEmpty()) {
                continue;
            }
            if (builder.length() > 0) {
                builder.append('&');
            }
            builder.append(part);
        }
        return builder.length() == 0 ? null : builder.toString();
    }

    /**
     * Returns {@code null} for a missing or empty fragment, otherwise the
     * fragment unchanged.
     */
    private static String normalizeFragment(String fragment) {
        if (fragment == null || fragment.isEmpty()) {
            return null;
        }
        return fragment;
    }

    /**
     * Replaces every run of consecutive {@code character} occurrences in
     * {@code text} with a single occurrence.
     *
     * @return the collapsed text; {@code text} itself when it is {@code null}
     *         or empty
     */
    private static String removeDuplicates(String text, char character) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder builder = new StringBuilder(text.length());
        boolean previousMatched = false;
        for (int i = 0; i < text.length(); ++i) {
            char value = text.charAt(i);
            boolean matches = value == character;
            if (matches && previousMatched) {
                continue; // still inside a run: skip the repeated character
            }
            builder.append(value);
            previousMatched = matches;
        }
        return builder.toString();
    }
}