init
This commit is contained in:
BIN
Sitemap.class
Normal file
BIN
Sitemap.class
Normal file
Binary file not shown.
229
Sitemap.java
Normal file
229
Sitemap.java
Normal file
@@ -0,0 +1,229 @@
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.regex.*;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
/**
 * Multi-threaded, single-site crawler that writes every discovered page URL to
 * {@code sitemap.txt.gz} (one URL per line) and a progress log to {@code crawl.log}.
 *
 * <p>Usage: {@code java Sitemap https://example.com}
 *
 * <p>All state is static and shared between worker threads: {@link #visited} and
 * {@link #enqueued} are concurrent sets, the sitemap writer is guarded by
 * {@link #fileLock} and the log file by {@link #logLock}.
 */
public class Sitemap {

    /** URLs a worker has already taken for processing (includes skipped assets). */
    private static final Set<String> visited = ConcurrentHashMap.newKeySet();

    /** URLs that have ever been offered to the queue; prevents duplicate enqueueing. */
    private static final Set<String> enqueued = ConcurrentHashMap.newKeySet();

    /** Work queue of normalized URLs waiting to be crawled. */
    private static final BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    /**
     * Number of URLs that are queued or currently being processed. Workers may only
     * stop once this reaches zero; an empty queue alone is not enough because another
     * worker may still be about to enqueue the links of the page it is fetching.
     */
    private static final java.util.concurrent.atomic.AtomicInteger pending =
            new java.util.concurrent.atomic.AtomicInteger();

    /** File extensions that are never fetched nor written to the sitemap. */
    private static final String[] SKIP_EXT = {
        ".pdf", ".zip", ".rar", ".7z", ".tar", ".gz",
        ".exe", ".dmg", ".iso",
        ".mp4", ".mkv", ".avi", ".mov",
        ".mp3", ".wav",
        ".jpg", ".jpeg", ".png", ".gif", ".webp",
        ".apk", ".msi",
        ".bundle",
        ".jar", ".patch"
    };

    /** Matches {@code href="..."} and {@code href='...'} attributes, in any letter case. */
    private static final Pattern HREF =
            Pattern.compile("href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')", Pattern.CASE_INSENSITIVE);

    private static final int MAX_PAGES = 5_000_000;
    private static final int WRITE_CHUNK = 50;
    private static final int CONNECT_TIMEOUT_MS = 1000;
    private static final int READ_TIMEOUT_MS = 1000;

    /** How long an idle worker waits for new work before re-checking {@link #pending}. */
    private static final long POLL_TIMEOUT_MS = 200;

    /** Host of the start URL; only hosts equal to it or below it are crawled. */
    private static String domain;

    // ---------------- OUTPUT STREAM (GZIP) ----------------
    private static final Object fileLock = new Object();
    private static GZIPOutputStream gzipOut;
    private static BufferedWriter writer;
    private static int bufferCount = 0;

    private static final Object logLock = new Object();
    private static final String LOG_FILE = "crawl.log";

    /**
     * Crawls the site given as the first argument and writes the sitemap.
     *
     * @param args {@code args[0]} is the start URL; prints usage and returns if absent
     * @throws Exception if the start URL is malformed, the output file cannot be
     *                   created or closed, or the wait for the workers is interrupted
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println("Usage: java Sitemap https://example.com");
            return;
        }

        String startUrl = normalize(args[0]);
        domain = new URL(startUrl).getHost();

        gzipOut = new GZIPOutputStream(new FileOutputStream("sitemap.txt.gz"));
        writer = new BufferedWriter(
                new OutputStreamWriter(gzipOut, java.nio.charset.StandardCharsets.UTF_8));

        int threads = Runtime.getRuntime().availableProcessors() * 2;
        System.out.println("Using " + threads + " threads");
        ExecutorService executor = Executors.newFixedThreadPool(threads);

        try {
            // Seed the queue before any worker starts so `pending` is already non-zero.
            safeAdd(startUrl);

            Runnable worker = Sitemap::crawlLoop;
            for (int i = 0; i < threads; i++) {
                executor.submit(worker);
            }

            executor.shutdown();
            if (!executor.awaitTermination(30, TimeUnit.MINUTES)) {
                // Time budget exhausted: interrupt the workers and give in-flight
                // requests a moment to finish before the writer is closed.
                log("SYSTEM", "Time limit reached, stopping workers");
                executor.shutdownNow();
                executor.awaitTermination(10, TimeUnit.SECONDS);
            }
        } finally {
            executor.shutdownNow(); // no-op when the pool has already terminated
            synchronized (fileLock) {
                writer.close(); // flushes and finishes the gzip stream
            }
        }

        log("SYSTEM", "DONE. Pages: " + visited.size());

        System.out.println("\nDONE");
        System.out.println("Pages: " + visited.size());
        System.out.println("Output: sitemap.txt.gz");
    }

    // ---------------- WORKER ----------------

    /**
     * Worker loop: takes URLs from the queue until the page limit is reached, the
     * thread is interrupted, or no work is queued or in flight anywhere.
     */
    private static void crawlLoop() {
        while (visited.size() < MAX_PAGES && !Thread.currentThread().isInterrupted()) {
            String url;
            try {
                url = queue.poll(POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            if (url == null) {
                if (pending.get() == 0) {
                    return; // queue empty and nobody can add to it any more
                }
                continue; // another worker is still fetching; more links may arrive
            }

            try {
                process(url);
            } catch (RuntimeException e) {
                log("ERROR", url + " -> " + e);
            } finally {
                // Decrement only after the page's links were enqueued, so the
                // counter never reaches zero while work is still outstanding.
                pending.decrementAndGet();
            }
        }
    }

    /** Records one dequeued URL in the sitemap and crawls it for further links. */
    private static void process(String url) {
        url = normalize(url);

        if (!visited.add(url)) {
            return;
        }

        if (isAsset(url)) {
            log("SKIP", url);
            return;
        }

        // NOTE(review): the URL is recorded before it is fetched, so unreachable and
        // non-HTML URLs also end up in the sitemap. Kept to preserve existing output.
        writeChunk(url);
        fetchAndEnqueueLinks(url);
    }

    /** Downloads {@code url} and enqueues every same-site link found in its HTML. */
    private static void fetchAndEnqueueLinks(String url) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            conn.setRequestProperty("User-Agent", "SitemapBot");

            String type = conn.getContentType();
            if (type == null || !type.toLowerCase(Locale.ROOT).contains("text/html")) {
                log("SKIP", url + " (non-html)");
                return;
            }

            for (String link : extractLinks(readBody(conn))) {
                String full = resolve(url, link);
                if (full != null && belongsToDomain(full)) {
                    safeAdd(full);
                }
            }
        } catch (SocketTimeoutException e) {
            log("TIMEOUT", url);
        } catch (Exception e) {
            // Boundary catch: one bad page must never take down a worker thread.
            log("ERROR", url + " -> " + e.getMessage());
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Reads the whole response body as UTF-8 text, with line terminators removed. */
    private static String readBody(HttpURLConnection conn) throws IOException {
        StringBuilder html = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                html.append(line);
            }
        }
        return html.toString();
    }

    /**
     * Extracts the raw values of all {@code href} attributes, in document order.
     *
     * @param html page markup; {@code null} yields an empty list
     * @return the attribute values exactly as written (not resolved, not decoded)
     */
    private static List<String> extractLinks(String html) {
        List<String> links = new ArrayList<>();
        if (html == null) {
            return links;
        }
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            String doubleQuoted = m.group(1);
            links.add(doubleQuoted != null ? doubleQuoted : m.group(2));
        }
        return links;
    }

    // ---------------- CHUNKED GZ WRITER ----------------

    /**
     * Appends one URL to the sitemap, flushing every {@link #WRITE_CHUNK} entries.
     * Entries are newline-terminated because the text sitemap format is one URL per line.
     */
    private static void writeChunk(String url) {
        synchronized (fileLock) {
            try {
                writer.write(url);
                writer.write("\n");
                bufferCount++;

                if (bufferCount >= WRITE_CHUNK) {
                    writer.flush();
                    bufferCount = 0;
                }
            } catch (IOException e) {
                log("WRITE_ERROR", e.getMessage());
            }
        }
    }

    // ---------------- SAFE QUEUE ----------------

    /**
     * Normalizes {@code url} and enqueues it unless it was enqueued or visited before.
     *
     * @param url candidate URL; {@code null} is ignored
     */
    private static void safeAdd(String url) {
        if (url == null) {
            return;
        }

        url = normalize(url);

        if (!enqueued.add(url)) {
            return;
        }

        if (!visited.contains(url)) {
            // Count first, then publish: a worker that dequeues immediately must
            // never observe pending == 0 for an item that exists.
            pending.incrementAndGet();
            queue.add(url);
            log("QUEUE", url);
        }
    }

    // ---------------- LOGGING ----------------

    /** Prints {@code [type] msg} to stdout and appends it to {@link #LOG_FILE}. */
    private static void log(String type, String msg) {
        String line = "[" + type + "] " + msg;
        System.out.println(line);

        synchronized (logLock) {
            try (Writer fw = new OutputStreamWriter(
                    new FileOutputStream(LOG_FILE, true),
                    java.nio.charset.StandardCharsets.UTF_8)) {
                fw.write(line + "\n");
            } catch (IOException ignored) {
                // Logging is best-effort: the line already went to stdout and a
                // failing log file must not abort the crawl.
            }
        }
    }

    // ---------------- HELPERS ----------------

    /** Returns whether {@code url} ends with one of the {@link #SKIP_EXT} extensions. */
    private static boolean isAsset(String url) {
        // Locale.ROOT: under e.g. a Turkish default locale "GIF" lower-cases to "gıf".
        String lower = url.toLowerCase(Locale.ROOT);
        for (String ext : SKIP_EXT) {
            if (lower.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Canonicalizes a URL for de-duplication: lower-case host, explicit non-default
     * port kept, query and fragment dropped, one trailing slash removed.
     *
     * @param url URL text; {@code null} is returned unchanged
     * @return {@code scheme://host[:port]/path}; for text that is not a valid URL,
     *         the input cut at the first {@code ?} or {@code #} without a trailing slash
     */
    private static String normalize(String url) {
        if (url == null) {
            return null;
        }
        try {
            URL u = new URL(url);
            StringBuilder out = new StringBuilder();
            out.append(u.getProtocol()).append("://")
               .append(u.getHost().toLowerCase(Locale.ROOT));

            // Keep an explicit port, otherwise http://host:8080/ would later be
            // fetched from port 80.
            int port = u.getPort();
            if (port != -1 && port != u.getDefaultPort()) {
                out.append(':').append(port);
            }

            out.append(stripTrailingSlash(u.getPath()));
            return out.toString();
        } catch (MalformedURLException e) {
            int cut = url.length();
            int query = url.indexOf('?');
            if (query >= 0) {
                cut = query;
            }
            int fragment = url.indexOf('#');
            if (fragment >= 0 && fragment < cut) {
                cut = fragment;
            }
            return stripTrailingSlash(url.substring(0, cut));
        }
    }

    /** Removes a single trailing {@code /} if present. */
    private static String stripTrailingSlash(String s) {
        return s.endsWith("/") ? s.substring(0, s.length() - 1) : s;
    }

    /**
     * Returns whether the host of {@code url} is the crawl domain or a sub-domain of
     * it. The match is on a label boundary, so {@code evilexample.com} does not
     * belong to {@code example.com}.
     */
    private static boolean belongsToDomain(String url) {
        String root = domain;
        if (url == null || root == null || root.isEmpty()) {
            return false;
        }
        try {
            String host = new URL(url).getHost().toLowerCase(Locale.ROOT);
            String base = root.toLowerCase(Locale.ROOT);
            return host.equals(base) || host.endsWith("." + base);
        } catch (MalformedURLException e) {
            return false;
        }
    }

    /**
     * Resolves {@code link} against {@code base} and strips the fragment.
     *
     * @return the absolute http(s) URL, or {@code null} if the link is {@code null},
     *         not crawlable (mailto, tel, javascript, other schemes) or malformed
     */
    private static String resolve(String base, String link) {
        if (link == null) {
            return null;
        }

        if (link.startsWith("mailto:")
                || link.startsWith("tel:")
                || link.startsWith("javascript:")) {
            return null;
        }

        try {
            URL target = new URL(new URL(base), link);

            // Only http(s) can be fetched through HttpURLConnection.
            String scheme = target.getProtocol();
            if (!"http".equals(scheme) && !"https".equals(scheme)) {
                return null;
            }

            String text = target.toString();
            int fragment = text.indexOf('#');
            return fragment >= 0 ? text.substring(0, fragment) : text;
        } catch (Exception e) {
            return null;
        }
    }
}
|
||||
Reference in New Issue
Block a user