import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.*;
import java.util.zip.GZIPOutputStream;

/**
 * Multi-threaded single-site crawler that writes every URL it visits to
 * {@code sitemap.txt.gz} (one URL per line, UTF-8, gzip-compressed) and mirrors its
 * progress messages to stdout and to {@code crawl.log}.
 *
 * <p>Usage: {@code java Sitemap https://example.com}
 *
 * <p>All state is static and shared between the worker threads: the two URL sets and the
 * queue are concurrent collections, the output writer is guarded by {@code fileLock}, and
 * the log file is guarded by {@code logLock}.
 */
public class Sitemap {

    /** Normalized URLs a worker has already claimed for processing. */
    private static final Set<String> visited = ConcurrentHashMap.newKeySet();

    /** Normalized URLs that have ever been offered to the queue (prevents duplicates). */
    private static final Set<String> enqueued = ConcurrentHashMap.newKeySet();

    /** URLs waiting to be crawled. */
    private static final BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    /**
     * Number of URLs that are either waiting in {@link #queue} or being processed right
     * now. Workers may only stop on an empty queue once this reaches zero; otherwise a
     * page that is still being parsed could add links after every other worker has quit.
     */
    private static final AtomicInteger pending = new AtomicInteger();

    /** File extensions (lower case) that are recorded as visited but never downloaded. */
    private static final String[] SKIP_EXT = {
        ".pdf", ".zip", ".rar", ".7z", ".tar", ".gz",
        ".exe", ".dmg", ".iso",
        ".mp4", ".mkv", ".avi", ".mov",
        ".mp3", ".wav",
        ".jpg", ".jpeg", ".png", ".gif", ".webp",
        ".apk", ".msi",
        ".bundle",
        ".jar", ".patch"
    };

    /** Workers stop claiming new URLs once this many have been visited. */
    private static final int MAX_PAGES = 5_000_000;

    /** The output writer is flushed after this many URLs have been written. */
    private static final int WRITE_CHUNK = 50;

    /** How long an idle worker waits for new work before re-checking {@link #pending}. */
    private static final long IDLE_POLL_MILLIS = 200;

    /** Matches double-quoted {@code href} attributes; compiled once, shared by all workers. */
    private static final Pattern HREF = Pattern.compile("href=\"(.*?)\"");

    /** Host of the start URL; set once in {@link #main} before any worker is started. */
    private static String domain;

    // ---------------- OUTPUT STREAM (GZIP) ----------------
    private static final Object fileLock = new Object();
    /** Sitemap output; guarded by {@code fileLock}, {@code null} outside of a crawl. */
    private static BufferedWriter writer;
    /** URLs written since the last flush; guarded by {@code fileLock}. */
    private static int bufferCount = 0;

    private static final Object logLock = new Object();
    private static final String LOG_FILE = "crawl.log";
    /** Set after the first log-file failure so it is reported only once; guarded by {@code logLock}. */
    private static boolean logFailureReported = false;

    /**
     * Crawls the site given as the first argument and writes {@code sitemap.txt.gz}.
     *
     * @param args {@code args[0]} is the absolute start URL, e.g. {@code https://example.com}
     * @throws Exception if the output file cannot be created or closed, or if the main
     *     thread is interrupted while waiting for the workers
     */
    public static void main(String[] args) throws Exception {

        if (args.length == 0) {
            System.out.println("Usage: java Sitemap https://example.com");
            return;
        }

        String startUrl = normalize(args[0]);
        try {
            domain = new URL(startUrl).getHost();
        } catch (MalformedURLException e) {
            // Typically a missing scheme ("example.com"); explain instead of dumping a stack trace.
            System.err.println("Invalid start URL: " + args[0]);
            System.out.println("Usage: java Sitemap https://example.com");
            return;
        }

        synchronized (fileLock) {
            writer = new BufferedWriter(new OutputStreamWriter(
                    new GZIPOutputStream(new FileOutputStream("sitemap.txt.gz")),
                    StandardCharsets.UTF_8));
        }

        try {
            safeAdd(startUrl);

            int threads = Runtime.getRuntime().availableProcessors() * 2;
            System.out.println("Using " + threads + " threads");

            ExecutorService executor = Executors.newFixedThreadPool(threads);
            try {
                Runnable worker = Sitemap::runWorker;
                for (int i = 0; i < threads; i++) {
                    executor.submit(worker);
                }

                executor.shutdown();
                if (!executor.awaitTermination(30, TimeUnit.MINUTES)) {
                    log("SYSTEM", "Time limit reached; stopping workers");
                }
            } finally {
                // No-op when the pool already terminated; otherwise interrupts the workers
                // so none of them keeps writing after the output is closed below.
                executor.shutdownNow();
            }
            executor.awaitTermination(10, TimeUnit.SECONDS);
        } finally {
            synchronized (fileLock) {
                try {
                    writer.close(); // close() flushes the buffer and finishes the gzip stream
                } finally {
                    writer = null;
                }
            }
        }

        log("SYSTEM", "DONE. Pages: " + visited.size());

        System.out.println("\nDONE");
        System.out.println("Pages: " + visited.size());
        System.out.println("Output: sitemap.txt.gz");
    }

    // ---------------- WORKER ----------------

    /**
     * Worker loop: takes URLs from the queue until the page limit is reached, the thread
     * is interrupted, or the crawl is finished (queue empty and nothing in flight).
     */
    private static void runWorker() {
        while (visited.size() < MAX_PAGES && !Thread.currentThread().isInterrupted()) {
            String url;
            try {
                url = queue.poll(IDLE_POLL_MILLIS, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            if (url == null) {
                if (pending.get() == 0) {
                    return; // nothing queued and nobody can still produce links
                }
                continue; // another worker is mid-page; wait for the links it may add
            }

            try {
                crawl(url);
            } finally {
                pending.decrementAndGet();
            }
        }
    }

    /**
     * Processes one queued URL: records it in the sitemap, downloads it if it is an HTML
     * page, and enqueues every same-site link found in it. Failures are logged, never thrown.
     *
     * @param queuedUrl URL taken from the queue
     */
    private static void crawl(String queuedUrl) {
        String url = normalize(queuedUrl);

        if (!visited.add(url)) {
            return;
        }

        if (isAsset(url)) {
            log("SKIP", url);
            return;
        }

        // The URL is recorded before it is fetched, so it is listed even if the fetch fails.
        writeChunk(url);

        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();

            conn.setRequestMethod("GET");
            conn.setConnectTimeout(1000);
            conn.setReadTimeout(1000);
            conn.setRequestProperty("User-Agent", "SitemapBot");

            String type = conn.getContentType();
            if (type == null || !type.contains("text/html")) {
                log("SKIP", url + " (non-html)");
                return;
            }

            Matcher m = HREF.matcher(readBody(conn));
            while (m.find()) {
                String full = resolve(url, m.group(1));
                if (full != null && belongsToDomain(full)) {
                    safeAdd(full);
                }
            }

        } catch (SocketTimeoutException e) {
            log("TIMEOUT", url);
        } catch (Exception e) {
            // Boundary of the worker: any failure on one page must not stop the crawl.
            log("ERROR", url + " -> " + e.getMessage());
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Reads the whole response body as text. Lines are concatenated without separators.
     *
     * @param conn connection whose response is read
     * @return the response body with line terminators removed
     * @throws IOException if the body cannot be read
     */
    private static String readBody(HttpURLConnection conn) throws IOException {
        StringBuilder html = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                html.append(line);
            }
        }
        return html.toString();
    }

    // ---------------- CHUNKED GZ WRITER ----------------

    /**
     * Appends one URL to the sitemap as its own line and flushes every
     * {@code WRITE_CHUNK} URLs. Does nothing when the output is not open.
     *
     * @param url URL to record
     */
    private static void writeChunk(String url) {
        synchronized (fileLock) {
            if (writer == null) {
                return;
            }
            try {
                writer.write(url);
                writer.write("\n"); // text sitemaps list one URL per line
                bufferCount++;

                if (bufferCount >= WRITE_CHUNK) {
                    writer.flush();
                    bufferCount = 0;
                }

            } catch (IOException e) {
                log("WRITE_ERROR", e.getMessage());
            }
        }
    }

    // ---------------- SAFE QUEUE ----------------

    /**
     * Queues a URL for crawling unless it is {@code null}, was queued before, or has
     * already been visited.
     *
     * @param url URL to queue; normalized before it is compared and stored
     */
    private static void safeAdd(String url) {
        if (url == null) {
            return;
        }

        String normalized = normalize(url);

        if (!enqueued.add(normalized)) {
            return;
        }

        if (!visited.contains(normalized)) {
            // Count the URL before publishing it so pending never under-reports queued work.
            pending.incrementAndGet();
            queue.add(normalized);
            log("QUEUE", normalized);
        }
    }

    // ---------------- LOGGING ----------------

    /**
     * Prints {@code [type] msg} to stdout and appends the same line to {@code crawl.log}.
     *
     * @param type short category tag, e.g. {@code SKIP} or {@code ERROR}
     * @param msg message text
     */
    private static void log(String type, String msg) {
        String line = "[" + type + "] " + msg;
        System.out.println(line);

        synchronized (logLock) {
            try (Writer fw = new OutputStreamWriter(
                    new FileOutputStream(LOG_FILE, true), StandardCharsets.UTF_8)) {
                fw.write(line + "\n");
            } catch (IOException e) {
                // The log file is a best-effort copy of stdout; report the problem once
                // instead of failing the crawl or repeating the warning for every line.
                if (!logFailureReported) {
                    logFailureReported = true;
                    System.err.println("Cannot write " + LOG_FILE + ": " + e.getMessage());
                }
            }
        }
    }

    // ---------------- HELPERS ----------------

    /**
     * Tells whether a URL ends with one of the {@code SKIP_EXT} extensions (case-insensitive).
     *
     * @param url URL to test
     * @return {@code true} if the URL ends with a skipped extension
     */
    private static boolean isAsset(String url) {
        String lower = url.toLowerCase(Locale.ROOT);
        for (String ext : SKIP_EXT) {
            if (lower.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reduces a URL to {@code scheme://host[:port]/path}: query string, fragment and one
     * trailing slash are removed. A port is kept only when it differs from the scheme's
     * default. Input that is not a parseable absolute URL is only stripped of query,
     * fragment and trailing slash.
     *
     * @param url URL to normalize; may be {@code null}
     * @return the normalized URL, or {@code null} if {@code url} is {@code null}
     */
    private static String normalize(String url) {
        if (url == null) {
            return null;
        }
        try {
            URL u = new URL(url);
            int port = u.getPort();
            String authority = (port == -1 || port == u.getDefaultPort())
                    ? u.getHost()
                    : u.getHost() + ":" + port;
            return u.getProtocol() + "://" + authority + stripTrailingSlash(u.getPath());
        } catch (MalformedURLException e) {
            String rest = url;
            int query = rest.indexOf('?');
            if (query >= 0) {
                rest = rest.substring(0, query);
            }
            int fragment = rest.indexOf('#');
            if (fragment >= 0) {
                rest = rest.substring(0, fragment);
            }
            return stripTrailingSlash(rest);
        }
    }

    /** Returns {@code s} without its final {@code '/'}, if it has one. */
    private static String stripTrailingSlash(String s) {
        return s.endsWith("/") ? s.substring(0, s.length() - 1) : s;
    }

    /**
     * Tells whether a URL's host is the crawl domain itself or one of its sub-domains.
     * The comparison is case-insensitive and respects label boundaries, so
     * {@code evilexample.com} does not belong to {@code example.com}.
     *
     * @param url absolute URL to test
     * @return {@code true} if the host matches; {@code false} for a non-matching host,
     *     an unparseable URL, or when no crawl domain has been set
     */
    private static boolean belongsToDomain(String url) {
        String root = domain;
        if (url == null || root == null || root.isEmpty()) {
            return false;
        }
        try {
            String host = new URL(url).getHost().toLowerCase(Locale.ROOT);
            String base = root.toLowerCase(Locale.ROOT);
            return host.equals(base) || host.endsWith("." + base);
        } catch (MalformedURLException e) {
            return false;
        }
    }

    /**
     * Resolves a link found on a page against that page's URL and drops the fragment.
     *
     * @param base URL of the page the link was found on
     * @param link raw {@code href} value; may be relative
     * @return the absolute URL without fragment, or {@code null} if the link is
     *     {@code null}, uses a non-crawlable scheme, or cannot be resolved
     */
    private static String resolve(String base, String link) {
        if (link == null) {
            return null;
        }

        String target = link.trim();
        if (hasScheme(target, "mailto:")
                || hasScheme(target, "tel:")
                || hasScheme(target, "javascript:")) {
            return null;
        }

        try {
            String absolute = new URL(new URL(base), target).toString();
            int fragment = absolute.indexOf('#');
            return fragment >= 0 ? absolute.substring(0, fragment) : absolute;
        } catch (Exception e) {
            return null;
        }
    }

    /** Case-insensitive test for a leading scheme prefix such as {@code "mailto:"}. */
    private static boolean hasScheme(String link, String scheme) {
        return link.regionMatches(true, 0, scheme, 0, scheme.length());
    }
}