Languages
[Edit]
EN

Java - normalize URL to canonical form

4 points
Created by:
Kara
541

In this short article, we would like to show how to normalize a URL in Java to a common canonical form.

Normalization form:

[INPUT]                                       [OUTPUT]

http://test.com:80                            http://test.com
https://test.com:80                           https://test.com:80
http://test.com:443                           http://test.com:443
https://test.com:443                          https://test.com
http://test.com/                              http://test.com
http://test.com?                              http://test.com
http://test.com?a=1&&b=2&                     http://test.com?a=1&b=2
http://test.com#                              http://test.com
http://test.com?b=2&a=1                       http://test.com?a=1&b=2
http://test.com:80/?#                         http://test.com
https://test.com:443/?#                       https://test.com
https://test.com/users/john/?p=10             https://test.com/users/john?p=10
https://test.com/users/john/?p=10#            https://test.com/users/john?p=10
https://test.com/users/john/?p=10#data        https://test.com/users/john?p=10#data
https://test.com/users/john/?#data            https://test.com/users/john#data
https://test.com:8080/users/john/?#data       https://test.com:8080/users/john#data
https://test.com:8080//users//john//?#data    https://test.com:8080/users/john#data
https://test.com:443//?#                      https://test.com

 

Program.java file:

package example.utils;

import java.net.URISyntaxException;

public final class UrlUtils {

    public static void main(String[] args) throws URISyntaxException {

        printNormalized("http://test.com:80");                         // http://test.com
        printNormalized("https://test.com:80");                        // https://test.com:80
        printNormalized("http://test.com:443");                        // http://test.com:443
        printNormalized("https://test.com:443");                       // https://test.com
        printNormalized("http://test.com/");                           // http://test.com
        printNormalized("http://test.com?");                           // http://test.com
        printNormalized("http://test.com?a=1&&b=2&");                  // http://test.com?a=1&b=2
        printNormalized("http://test.com#");                           // http://test.com
        printNormalized("http://test.com?b=2&a=1");                    // http://test.com?a=1&b=2
        printNormalized("http://test.com:80/?#");                      // http://test.com
        printNormalized("https://test.com:443/?#");                    // https://test.com
        printNormalized("https://test.com/users/john/?p=10");          // https://test.com/users/john?p=10
        printNormalized("https://test.com/users/john/?p=10#");         // https://test.com/users/john?p=10
        printNormalized("https://test.com/users/john/?p=10#data");     // https://test.com/users/john?p=10#data
        printNormalized("https://test.com/users/john/?#data");         // https://test.com/users/john#data
        printNormalized("https://test.com:8080/users/john/?#data");    // https://test.com:8080/users/john#data
        printNormalized("https://test.com:8080//users//john//?#data"); // https://test.com:8080/users/john#data
        printNormalized("https://test.com:443//?#");                   // https://test.com
    }

    public static void printNormalized(String url) throws URISyntaxException {
        System.out.println(UrlUtils.normalizeUrl(url));
    }
}

UrlUtils.java file:

package example.utils;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Locale;

/**
 * Utility methods for bringing hierarchical URLs (such as {@code http} and
 * {@code https} URLs) into a canonical textual form.
 *
 * <p>All components are processed in their raw (still percent-encoded) form, so an
 * escaped delimiter such as {@code %26} or {@code %2F} is never mistaken for a real
 * {@code &} or {@code /}.
 */
public final class UrlUtils {

    private UrlUtils() {
        // static utility class - not meant to be instantiated
    }

    /**
     * Normalizes a URL to a canonical form.
     *
     * <p>The following rules are applied:
     * <ul>
     *   <li>the scheme is converted to lower case,</li>
     *   <li>the default port is removed (80 for http, 443 for https),</li>
     *   <li>repeated and trailing slashes are removed from the path,</li>
     *   <li>empty query parameters are removed and the remaining ones are sorted,</li>
     *   <li>an empty query ({@code ?}) and an empty fragment ({@code #}) are removed.</li>
     * </ul>
     *
     * @param url the URL to normalize; must contain a scheme
     * @return the normalized URL
     * @throws URISyntaxException       if {@code url} (or the rebuilt URL) is not a valid URI
     * @throws IllegalArgumentException if {@code url} has no scheme
     */
    public static String normalizeUrl(String url) throws URISyntaxException {
        URI uri = new URI(url);

        String scheme = uri.getScheme();
        if (scheme == null) {
            throw new IllegalArgumentException("URL scheme is required.");
        }
        // Schemes are case-insensitive; lower-casing also lets "HTTP://host:80" lose its default port.
        scheme = scheme.toLowerCase(Locale.ROOT);

        String host = uri.getHost();
        String path = normalizePath(uri.getRawPath());
        String query = normalizeQuery(uri.getRawQuery());
        String fragment = normalizeFragment(uri.getRawFragment());

        StringBuilder builder = new StringBuilder(url.length());
        builder.append(scheme).append(':');
        if (host != null) {
            builder.append("//");
            String userInfo = uri.getRawUserInfo();
            if (userInfo != null) {
                builder.append(userInfo).append('@');
            }
            builder.append(host);
            int port = normalizePort(scheme, uri.getPort());
            if (port != -1) {
                builder.append(':').append(port);
            }
        } else if (uri.getRawAuthority() != null) {
            // The authority could not be split into user/host/port; keep it verbatim instead of dropping it.
            builder.append("//").append(uri.getRawAuthority());
        }
        if (path != null) {
            builder.append(path);
        }
        if (query != null) {
            builder.append('?').append(query);
        }
        if (fragment != null) {
            builder.append('#').append(fragment);
        }

        // Parsing the rebuilt text once more validates it and keeps URISyntaxException as the failure mode.
        return new URI(builder.toString()).toString();
    }

    /**
     * Returns {@code -1} (meaning "no explicit port") when {@code port} is the default
     * port of {@code scheme}; otherwise returns {@code port} unchanged.
     */
    private static int normalizePort(String scheme, int port) {
        boolean httpDefault = port == 80 && "http".equalsIgnoreCase(scheme);
        boolean httpsDefault = port == 443 && "https".equalsIgnoreCase(scheme);
        return (httpDefault || httpsDefault) ? -1 : port;
    }

    /**
     * Collapses runs of {@code /} into a single slash and removes one trailing slash.
     *
     * @return the normalized path, or {@code null} when nothing is left
     */
    private static String normalizePath(String path) {
        String collapsed = removeDuplicates(path, '/');
        if (collapsed == null || collapsed.isEmpty()) {
            return null;
        }
        int lastIndex = collapsed.length() - 1;
        if (collapsed.charAt(lastIndex) == '/') {
            collapsed = collapsed.substring(0, lastIndex);
        }
        return collapsed.isEmpty() ? null : collapsed;
    }

    /**
     * Sorts the {@code &}-separated query parameters and removes empty ones.
     *
     * @return the normalized query, or {@code null} when no parameter is left
     */
    private static String normalizeQuery(String query) {
        if (query == null || query.isEmpty()) {
            return null;
        }
        // String.split drops trailing empty strings, so the result may hold 0 or 1 entries
        // even when the query contains stray separators ("&", "a=1&"); always rebuild it.
        String[] parts = query.split("&");
        Arrays.sort(parts);
        StringBuilder builder = new StringBuilder(query.length());
        for (String part : parts) {
            if (part.isEmpty()) {
                continue;
            }
            if (builder.length() > 0) {
                builder.append('&');
            }
            builder.append(part);
        }
        return builder.length() == 0 ? null : builder.toString();
    }

    /**
     * Returns {@code null} for a missing or empty fragment; otherwise returns it unchanged.
     */
    private static String normalizeFragment(String fragment) {
        if (fragment == null || fragment.isEmpty()) {
            return null;
        }
        return fragment;
    }

    /**
     * Replaces each run of consecutive {@code character} occurrences in {@code text}
     * with a single occurrence.
     *
     * @return the collapsed text; {@code null} or empty input is returned as is
     */
    private static String removeDuplicates(String text, char character) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder builder = new StringBuilder(text.length());
        boolean previousMatched = false;
        for (int i = 0; i < text.length(); ++i) {
            char value = text.charAt(i);
            boolean matches = value == character;
            if (!(matches && previousMatched)) {
                builder.append(value);
            }
            previousMatched = matches;
        }
        return builder.toString();
    }
}

Alternative titles

  1. Java - URL normalization util
  2. Java - remove neutral characters from URL (normalization)
Donate to Dirask
Our content is created by volunteers - like Wikipedia. If you think the things we do are good, donate to us. Thanks!
Join our subscribers to stay up to date with content, news and offers.
Native Advertising
🚀
Get your tech brand or product in front of software developers.
For more information Contact us
Dirask - we help you to
solve coding problems.
Ask question.

❤️💻 🙂

Join