This commit is contained in:
2026-05-07 09:51:54 -04:00
parent 2a9b561db0
commit ee80b831d5
2 changed files with 229 additions and 0 deletions

BIN
Sitemap.class Normal file

Binary file not shown.

229
Sitemap.java Normal file
View File

@@ -0,0 +1,229 @@
import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.*;
import java.util.zip.GZIPOutputStream;
/**
 * Multi-threaded, single-site crawler that records every HTML page it can
 * reach in {@code sitemap.txt.gz} (one URL per line, UTF-8) and appends a
 * progress log to {@code crawl.log}.
 *
 * <p>Usage: {@code java Sitemap https://example.com}
 *
 * <p>All state is static; the class is meant to be run once per JVM via
 * {@link #main(String[])}.
 */
public class Sitemap {

    // ---------------- CRAWL STATE ----------------

    /** Normalized URLs that a worker has already claimed for processing. */
    private static final Set<String> visited = ConcurrentHashMap.newKeySet();

    /** Normalized URLs that have ever been offered to the queue (discovery de-duplication). */
    private static final Set<String> enqueued = ConcurrentHashMap.newKeySet();

    /** Frontier of URLs waiting to be fetched. */
    private static final BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    /**
     * Number of URLs that are either queued or currently being processed.
     * Workers may only exit on an empty queue when this is zero; otherwise a
     * busy sibling could still discover new links.
     */
    private static final AtomicInteger pending = new AtomicInteger();

    /** URL suffixes that are never fetched because they cannot be HTML pages. */
    private static final String[] SKIP_EXT = {
        ".pdf", ".zip", ".rar", ".7z", ".tar", ".gz",
        ".exe", ".dmg", ".iso",
        ".mp4", ".mkv", ".avi", ".mov",
        ".mp3", ".wav",
        ".jpg", ".jpeg", ".png", ".gif", ".webp",
        ".apk", ".msi",
        ".bundle",
        ".jar", ".patch"
    };

    /** Matches {@code href} attributes with double- or single-quoted values, in any letter case. */
    private static final Pattern HREF = Pattern.compile(
            "href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')", Pattern.CASE_INSENSITIVE);

    private static final int MAX_PAGES = 5_000_000;
    private static final int WRITE_CHUNK = 50;
    private static final int CONNECT_TIMEOUT_MS = 1000;
    private static final int READ_TIMEOUT_MS = 1000;
    /** How long an idle worker waits for new work before re-checking whether the crawl is finished. */
    private static final long IDLE_POLL_MS = 200;
    private static final long CRAWL_TIME_LIMIT_MINUTES = 30;
    private static final long SHUTDOWN_GRACE_SECONDS = 10;

    /** Host of the start URL; only this host and its sub-domains are crawled. */
    private static String domain;

    // ---------------- OUTPUT STREAM (GZIP) ----------------

    /** Guards {@link #writer}, {@link #bufferCount} and {@link #pagesWritten}. */
    private static final Object fileLock = new Object();
    private static GZIPOutputStream gzipOut;
    /** Sitemap writer; {@code null} before it is opened and after it has been closed. */
    private static BufferedWriter writer;
    private static int bufferCount = 0;
    private static int pagesWritten = 0;

    private static final Object logLock = new Object();
    private static final String LOG_FILE = "crawl.log";

    /**
     * Crawls the site rooted at {@code args[0]} and writes the sitemap.
     *
     * @param args a single absolute start URL, e.g. {@code https://example.com}
     * @throws Exception if the start URL is malformed, the output file cannot be
     *                   opened or closed, or the main thread is interrupted
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println("Usage: java Sitemap https://example.com");
            return;
        }

        String startUrl = normalize(args[0]);
        domain = new URL(startUrl).getHost();

        // syncFlush=true makes the periodic writer.flush() actually push the
        // compressed bytes to disk instead of leaving them in the deflater.
        gzipOut = new GZIPOutputStream(new FileOutputStream("sitemap.txt.gz"), true);
        writer = new BufferedWriter(new OutputStreamWriter(gzipOut, StandardCharsets.UTF_8));

        int pages;
        try {
            safeAdd(startUrl);

            int threadCount = Runtime.getRuntime().availableProcessors() * 2;
            System.out.println("Using " + threadCount + " threads");

            ExecutorService executor = Executors.newFixedThreadPool(threadCount);
            for (int i = 0; i < threadCount; i++) {
                executor.submit(Sitemap::crawlLoop);
            }
            executor.shutdown();

            if (!executor.awaitTermination(CRAWL_TIME_LIMIT_MINUTES, TimeUnit.MINUTES)) {
                // Stop the workers before the writer is closed underneath them.
                log("SYSTEM", "Time limit reached; stopping workers");
                executor.shutdownNow();
                executor.awaitTermination(SHUTDOWN_GRACE_SECONDS, TimeUnit.SECONDS);
            }
        } finally {
            pages = closeOutput();
        }

        log("SYSTEM", "DONE. Pages: " + pages);
        System.out.println("\nDONE");
        System.out.println("Pages: " + pages);
        System.out.println("Output: sitemap.txt.gz");
    }

    // ---------------- WORKERS ----------------

    /**
     * Worker body: repeatedly takes a URL from the queue and crawls it until the
     * page limit is reached, the thread is interrupted, or the queue is empty
     * while no other worker still has a URL in flight.
     */
    private static void crawlLoop() {
        while (visited.size() < MAX_PAGES) {
            String url;
            try {
                url = queue.poll(IDLE_POLL_MS, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            if (url == null) {
                if (pending.get() == 0) {
                    return;
                }
                // Another worker is still fetching and may enqueue more links.
                continue;
            }

            try {
                crawl(url);
            } finally {
                pending.decrementAndGet();
            }
        }
    }

    /**
     * Fetches one URL, records it in the sitemap if it is a successfully served
     * HTML page, and enqueues every same-domain link found in it.
     * Failures are logged and never propagate.
     */
    private static void crawl(String queuedUrl) {
        String url = normalize(queuedUrl);
        if (!visited.add(url)) {
            return;
        }
        if (isAsset(url)) {
            log("SKIP", url);
            return;
        }

        HttpURLConnection conn = null;
        boolean bodyRead = false;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            conn.setRequestProperty("User-Agent", "SitemapBot");

            int status = conn.getResponseCode();
            if (status < 200 || status >= 300) {
                // Redirects that switch protocol (http -> https) are not followed
                // automatically, so queue the target explicitly when there is one.
                String redirect = resolve(url, conn.getHeaderField("Location"));
                if (redirect != null && belongsToDomain(redirect)) {
                    safeAdd(redirect);
                }
                log("SKIP", url + " (HTTP " + status + ")");
                return;
            }

            String type = conn.getContentType();
            if (type == null || !type.toLowerCase(Locale.ROOT).contains("text/html")) {
                log("SKIP", url + " (non-html)");
                return;
            }

            // Only URLs confirmed to serve HTML belong in the sitemap.
            writeChunk(url);

            String html = readBody(conn);
            bodyRead = true;

            // Resolve relative links against the final URL in case of a redirect.
            String base = conn.getURL().toString();
            for (String link : extractLinks(html)) {
                String full = resolve(base, link);
                if (full != null && belongsToDomain(full)) {
                    safeAdd(full);
                }
            }
        } catch (SocketTimeoutException e) {
            log("TIMEOUT", url);
        } catch (Exception e) {
            log("ERROR", url + " -> " + e.getMessage());
        } finally {
            // A fully read and closed body lets the connection be reused; on every
            // other path the connection must be released explicitly.
            if (conn != null && !bodyRead) {
                conn.disconnect();
            }
        }
    }

    /** Reads the whole response body as text, concatenating lines without separators. */
    private static String readBody(HttpURLConnection conn) throws IOException {
        StringBuilder html = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                html.append(line);
            }
        }
        return html.toString();
    }

    /** Returns the trimmed values of all quoted {@code href} attributes in {@code html}, in document order. */
    private static List<String> extractLinks(String html) {
        List<String> links = new ArrayList<>();
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            String value = m.group(1) != null ? m.group(1) : m.group(2);
            links.add(value.trim());
        }
        return links;
    }

    // ---------------- CHUNKED GZ WRITER ----------------

    /**
     * Appends one URL line to the sitemap and flushes every {@code WRITE_CHUNK}
     * URLs. Does nothing once the output has been closed.
     */
    private static void writeChunk(String url) {
        synchronized (fileLock) {
            if (writer == null) {
                return;
            }
            try {
                writer.write(url);
                writer.write("\n"); // text sitemaps require one URL per line
                pagesWritten++;
                bufferCount++;
                if (bufferCount >= WRITE_CHUNK) {
                    writer.flush();
                    bufferCount = 0;
                }
            } catch (IOException e) {
                log("WRITE_ERROR", e.getMessage());
            }
        }
    }

    /**
     * Flushes and closes the sitemap; later {@link #writeChunk(String)} calls become no-ops.
     *
     * @return the number of URLs written to the sitemap
     */
    private static int closeOutput() throws IOException {
        synchronized (fileLock) {
            BufferedWriter w = writer;
            writer = null;
            if (w != null) {
                w.close();
            }
            return pagesWritten;
        }
    }

    // ---------------- SAFE QUEUE ----------------

    /** Normalizes {@code url} and enqueues it unless it is null or was seen before. */
    private static void safeAdd(String url) {
        if (url == null) {
            return;
        }
        String normalized = normalize(url);
        if (!enqueued.add(normalized)) {
            return;
        }
        if (visited.contains(normalized)) {
            return;
        }
        // Count the URL before it becomes visible so that pending never reads
        // zero while work is still queued.
        pending.incrementAndGet();
        queue.add(normalized);
        log("QUEUE", normalized);
    }

    // ---------------- LOGGING ----------------

    /** Prints {@code [type] msg} to stdout and appends the same line to the log file. */
    private static void log(String type, String msg) {
        String line = "[" + type + "] " + msg;
        System.out.println(line);
        synchronized (logLock) {
            try (FileWriter fw = new FileWriter(LOG_FILE, true)) {
                fw.write(line + "\n");
            } catch (IOException ignored) {
                // Best effort: the line already went to stdout and a broken log
                // file must not abort the crawl.
            }
        }
    }

    // ---------------- HELPERS ----------------

    /** Returns true if {@code url} ends with one of the {@code SKIP_EXT} suffixes (case-insensitive). */
    private static boolean isAsset(String url) {
        String lower = url.toLowerCase(Locale.ROOT);
        for (String ext : SKIP_EXT) {
            if (lower.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Canonicalizes a URL for de-duplication: lower-cases the host, keeps an
     * explicit non-default port, and drops the query string, fragment and one
     * trailing slash. Strings that do not parse as a URL only get the
     * query/fragment/trailing-slash treatment.
     */
    private static String normalize(String url) {
        try {
            URL u = new URL(url);
            StringBuilder sb = new StringBuilder();
            sb.append(u.getProtocol()).append("://").append(u.getHost().toLowerCase(Locale.ROOT));
            int port = u.getPort();
            if (port != -1 && port != u.getDefaultPort()) {
                // Dropping the port would point the crawler at a different server.
                sb.append(':').append(port);
            }
            sb.append(stripTrailingSlash(u.getPath()));
            return sb.toString();
        } catch (Exception e) {
            String s = url;
            int query = s.indexOf('?');
            if (query >= 0) {
                s = s.substring(0, query);
            }
            int fragment = s.indexOf('#');
            if (fragment >= 0) {
                s = s.substring(0, fragment);
            }
            return stripTrailingSlash(s);
        }
    }

    /** Removes a single trailing {@code '/'} from {@code s}, if present. */
    private static String stripTrailingSlash(String s) {
        return s.endsWith("/") ? s.substring(0, s.length() - 1) : s;
    }

    /**
     * Returns true if the host of {@code url} is the crawl domain or one of its
     * sub-domains. A bare suffix match is not enough: {@code evilexample.com}
     * must not pass for {@code example.com}.
     */
    private static boolean belongsToDomain(String url) {
        String root = domain;
        if (root == null || root.isEmpty()) {
            return false;
        }
        try {
            String host = new URL(url).getHost().toLowerCase(Locale.ROOT);
            String rootLower = root.toLowerCase(Locale.ROOT);
            return host.equals(rootLower) || host.endsWith("." + rootLower);
        } catch (Exception e) {
            return false;
        }
    }

    /**
     * Resolves {@code link} against {@code base} and strips any fragment.
     *
     * @return the absolute http(s) URL, or {@code null} if {@code link} is null,
     *         uses a non-web scheme, or cannot be parsed
     */
    private static String resolve(String base, String link) {
        if (link == null) {
            return null;
        }
        String lower = link.toLowerCase(Locale.ROOT);
        if (lower.startsWith("mailto:")
                || lower.startsWith("tel:")
                || lower.startsWith("javascript:")) {
            return null;
        }
        try {
            URL resolved = new URL(new URL(base), link);
            String protocol = resolved.getProtocol();
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                return null; // the crawler can only fetch over HTTP(S)
            }
            String text = resolved.toString();
            int fragment = text.indexOf('#');
            return fragment >= 0 ? text.substring(0, fragment) : text;
        } catch (Exception e) {
            return null;
        }
    }
}