From ee80b831d52ed8809873b1db0fa43dc6e257dbd5 Mon Sep 17 00:00:00 2001 From: Astronand Date: Thu, 7 May 2026 09:51:54 -0400 Subject: [PATCH] init --- Sitemap.class | Bin 0 -> 8215 bytes Sitemap.java | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 Sitemap.class create mode 100644 Sitemap.java diff --git a/Sitemap.class b/Sitemap.class new file mode 100644 index 0000000000000000000000000000000000000000..3c3b63e635a56e88ccbc1844e97892fb1c577a1a GIT binary patch literal 8215 zcmX^0Z`VEs1_oQk=Ufa-49x5dEIbUX3~Y=H0$GV=iTXK-dFlH8Nm;4MC5#MgHko;u zC3cJq%o>_uoD3Wcoa_u-JPh0nJSfV8D~n4~a~TWR#Q?6kF-*r&c897UZPrCFkdIF$gdS zvNH(rFbFe$Z!25AO4P{@gZ zSPDE0iVRAO4D9KtB_8?3C5#Lr8k(5Ka59K7sPHhTGN>^!u%+bZCT8X_G6-R+V`tD{ zWRL?}S6Y&pqhFO-pzj{#8Q@=9QcwyFQFaC`Mg|G6n#_ECx6GVWWLXXd9YzMeVn`6{ zl~k1IrB`t=XfnugF{m@>gWPSv$RG*zd}h8rvR-RYT#GRn@h})On1K9Lm092qidhAS zOL02Mnw`OnkwFaRCa2Q0wA7;1l<=a=lGGx01`9?8DVQ9xeGqvr26F~|E(TKuYmn<~ z7#V~Su7jw6y33A-!Jff^k%6roY%U{%1jJdH`TD3fT7y!66AyzkC=Ia}C#I!3rlfE& zxH7o0Gr03GcrbV}G6;el1Wq_XrFkWpxv7i{TTSJVJ%7L8f@~F!(U|GBOA! zmL+E9Bqrsg1{CEdrxq9I7ZrnZjHeI-1A{*gLjXe{BZGKuVs@%?eqM57NqA;ShI4*i zaYFmYJMtEyTdc5Xr+3#SqQNz+6M{-ecYKb*BLlZ+YJ3|W( zLn|ny@*=_}IJJb4fu%UJDpiPqiNPO~Q#yDUIvKhc8JKhO(-|4mks}IPrb0{yM;%l| zGmL|whmnDm%f;W%RfvI^!JmVnpOHZflD(}IK<>&-EPxk4rVJB#7$$-8BP%%faxhF` zWMEVP`DPjq!*qrjj11gK;9|o$zcjCek%7^ZonaPI4dChTT9KSu0Ez+*hB=H3+~Gl< zA+GVRK|%gOTnsZA=J7Dh2NlGiJnWlV461HGhA-q{SOf|xj?}!u($vz_6i~=4;bBNt|~^GXsk^NK+xuI6D_!?2c-ffZ~XBZCS= zH+pJv%E?d8&df^>1lelM&9I(f2|L3EMg}EBR3PO_6xAFIn;02b14CUyU4+O;=V3U(aFUUMH!&q8xU`_4D7CmaHHDEu0@;6X9pL)>G{YHohO;~j=Rh%x)Leli zMn(qil6;^1^3)>d#Nt$tZ!hpLTx7V!$iNZo?HLg78WF zWZ+26ODP5w&}az)VuiJ46sWep!NYJ9vEpF($;iMB zN^8Z*MVSR9R-hpJ4N6siK&2_fSJ2j#HH3%kAy9)(n&BT0!+%f^bCl#mJj%@A*jBGrN>>xL?XBIma7pInRFmf_7uz*qsNS2$2 zkq1;fae<2`=Zw<4Y%T_AMm`=!entUC2EP1))I8_>yu8$8P!Yt)AO))>p$}Pch?XuMnOg~9!7DHy#mFlB|)i$rK!awzNsY{`6(bt zNghTiMrlR{!Q#{ssD4l_1#WG#Xn1OdffUK|Fv@`x@qtZ9Oo6K4U{qja;0i5HEz)&N zPt7agU{qpc;DWY=o$^aST2*)$RY4`c5LheN@_?fJg4CjtN|3ZV52FSs%=yy6)|aH_ zm4s9lq;fE7F*0zLq*j#ZXO!gTa50=_)Zt-R$*9Z7AdQ@koih@Pf`m7?hfrl3K*hXvD}M2}&oRCckGMyu}NV2D!+DhtZVLjFAE4A|wTj4C1ip z$jsMAlCb7tG-R~kVPIvnL=-S+NEBg3O#$y!K*hv<7+VAIO#<9>!qC5O8KMNzOUWMGB1DV{UT6Jp?C*uuq_%b3T`n9svlz*xx0 zpawP&>`9~w9Mr&vw9P#di!*!^3qZZi)N=3CN^pbA6qY;*>QjMe3Cm0^w`ON7W@OO7 zYM)PLUUq5rAyEZ|_Q0p(&!AfX13P+u?56R$xY{AYy+_}i}N`cJ3y@50uv6#E-)**jDxWU z#7Zp7Xo7aSQ|bZlv)%5Y9BE&ut3HJ#5AzfzM5f-47|Z5iOJc%i3MQw>@I+qB$iSPEn46T6r~)!x#ek84!zMWgI>y5loL^d$oC+FAU}WHd))0E2 zWXH%L;FOO;ykP@YIE zaZ5}t$uFv8WSD_m@Af_IgXfi;>j79??E=X2E*G_?vk%7lEFE6#oIVZ8WIJKCa@c|=)At+hu zqvusnN<%4qkxEfU1};RA%Fg(hkwFz~268+?0uK^Cuw=;2_!Kt!f>S9Y0~;jGGAJ^L zF)%SOFz_-+f%@hQjEu_}7#NrsS1?F{I-n^0H4JOv{FMw+psor71A`RfDux*hjEt+n z`at|ONc^=7T%gfIkUG$)0t2X5Va#9xrtKLV!1Q#68DRMx3?(3%aUFvi%shtO4BQNi zpy6HyM#l9F3=Faij0}tn3=FJV+Zh-)GB7YOGHzgCU|<6aaxic*Ze-lVzywmtxSN5M zft`VY;iJ~RSqwaz7}$k)w=wW<)M7T;!653hn?X8Kdpm>dHU|0449dG1)FXE@XhsTY zZ)ecm&Y-uM!EiT&Y2b-HagH zEf`q&{xc*qu(304VP`na$j%VU&bXD4f#DZ}$bSZHG;wwYCI&`!h9kci*crDoFfgz% zFfu%6-~#n;ps~M~fr){Ifq}st6mUMEaMRkxFkuset38JHQ^81xy87-llCG1xIUGB7YW zGPr+zy%uFg?c3)-WwUc2n6T>V9mW`VjYPEJUEC+FfH!-Zz+R3nvi6LAFq_Q29?ji1z1E=sM46F>x z7`PZ#Fo-j(Vo+vS&7jM$hQXL&9oQ|b49u*8(hOXn!7_;DjJymi3>*v$3^7{TyFkgq zcM}7<4meS5i}X9Rn_*|9&~BFP4129uBw2KJF&qFTuS1e7+ZYZnXOOgFg|c`+sZ^3} z8^bY4*8R&FSglxe4=-onvSQagEXlrqBLgdgB7+Wt3BwTvGX`r0JBA|+q6}UPz6>)N zL>U4ZLKqkrLKq^zG{iGn3|!z8%fYaPfuCU;gDAsx21SM)3_1)u8B7?qGk7uVWbkL$ z#SqP~8|*Q629Ey>CJe0X45wj;iJf65I1T-0;ACKfiJoR>07{>ns=6?))?2KC&8AX0DurhFg zW_h5N7BR4bOJOTesid`?LDolm8^d+&Z4C0-+Zb*M-4(jOjp5-gh9?XR8+^2cUT$M} z&1@v}9zsDRH-HO3MFu5?=M0e01P#G4Fq~xIWH`m3#BiEHnc)oB9c&DYN$lW!mCSGr zYQHA~GuVDnnEl%rOo#EFu zhQClBZt!IU`A(U^mcb701W3Za$-u^Nn?Z=-4udqqT?S=_2Ml%$4;kzk9)X?2$-u%U z#TdoT@LYfkRP94t#pJ@k%D~CMz^Dr=KZO|?7BMPrVpz44k&%(%0K;i-99t1|g7fyBPJr<@@sg*}EC_BX=_zMrvN7=I4cfGj$;f0k{ml26K67TNOJ68a9sZX z+HwYYA7REcu%8%2e1sV@w6wP|X6<6khPdh9^^FWH3?2+q8F?8~7-leXfoTw5g5f!s z527LIZXSa=0|SE%10RDOgA{{3gFJ&Hg9?K)gE50EgB61tgCm1GgA0Qfg9n2zLpXy! zLmWc@Ln=ccLk>d_Lm5K|Lp4JvLnlKR!&HWFh8YY|3=0{e8CEmIF>GN-WZ1`$#Bh=! zmEkNyI>TLtOopcnSqyI(vKhWIX7UWU_=%)*SN!I3P&jOD?Rtip_y!I5mjjMc%B?81z-!I2!ojP=2h z0&K#Jjlq!u?81!A!I1(S!i=rKkpi5;jP1dZ0$jq3oxza;+`^3A!I1(y!i>GakpjHJ zjQzop0(`=Z6N4iK_=OoK2S*AB2s2I%jua3SW}F@zDIg@wI5Rj>Kva z;79>cVaEBvkpg1Ej0=M!1;m9J7Y9cQNC-174c^Rfem4VCX~O|3KVFo-emF&t-*XSm3q#Bh~Cg@KuI7r0Q|&A11g7&saCGB7e8W<1Kk z%y@?JEaMq4{{oo21SYS5$!lQp2AI4BChsubV_;wqVqjvt&zQ^jkb#r&5#tlaXAF`6 D95HvC literal 0 HcmV?d00001 diff --git a/Sitemap.java b/Sitemap.java new file mode 100644 index 0000000..1cc9869 --- /dev/null +++ b/Sitemap.java @@ -0,0 +1,229 @@ +import java.io.*; +import java.net.*; +import java.util.*; +import java.util.concurrent.*; +import java.util.regex.*; +import java.util.zip.GZIPOutputStream; + +public class Sitemap { + + private static final Set visited = ConcurrentHashMap.newKeySet(); + private static final Set enqueued = ConcurrentHashMap.newKeySet(); + private static final BlockingQueue queue = new LinkedBlockingQueue<>(); + + private static final String[] SKIP_EXT = { + ".pdf", ".zip", ".rar", ".7z", ".tar", ".gz", + ".exe", ".dmg", ".iso", + ".mp4", ".mkv", ".avi", ".mov", + ".mp3", ".wav", + ".jpg", ".jpeg", ".png", ".gif", ".webp", + ".apk", ".msi", + ".bundle", + ".jar", ".patch" + }; + + private static final int MAX_PAGES = 5_000_000; + private static final int WRITE_CHUNK = 50; + + private static String domain; + + // ---------------- OUTPUT STREAM (GZIP) ---------------- + private static final Object fileLock = new Object(); + private static GZIPOutputStream gzipOut; + private static BufferedWriter writer; + private static int bufferCount = 0; + + private static final Object logLock = new Object(); + private static final String LOG_FILE = "crawl.log"; + + public static void main(String[] args) throws Exception { + + if (args.length == 0) { + System.out.println("Usage: java Sitemap https://example.com"); + return; + } + + String startUrl = normalize(args[0]); + domain = new URL(startUrl).getHost(); + + gzipOut = new GZIPOutputStream(new FileOutputStream("sitemap.txt.gz")); + writer = new BufferedWriter(new OutputStreamWriter(gzipOut)); + + safeAdd(startUrl); + + int THREADS = Runtime.getRuntime().availableProcessors() * 2; + System.out.println("Using " + THREADS + " threads"); + + ExecutorService executor = Executors.newFixedThreadPool(THREADS); + + Runnable worker = () -> { + while (visited.size() < MAX_PAGES) { + + String url = queue.poll(); + if (url == null) return; + + url = normalize(url); + + if (!visited.add(url)) continue; + + if (isAsset(url)) { + log("SKIP", url); + continue; + } + + writeChunk(url); + + try { + HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); + + conn.setRequestMethod("GET"); + conn.setConnectTimeout(1000); + conn.setReadTimeout(1000); + conn.setRequestProperty("User-Agent", "SitemapBot"); + + String type = conn.getContentType(); + if (type == null || !type.contains("text/html")) { + log("SKIP", url + " (non-html)"); + continue; + } + + BufferedReader in = new BufferedReader( + new InputStreamReader(conn.getInputStream()) + ); + + StringBuilder html = new StringBuilder(); + String line; + + while ((line = in.readLine()) != null) { + html.append(line); + } + + in.close(); + + Matcher m = Pattern.compile("href=\"(.*?)\"").matcher(html.toString()); + + while (m.find()) { + String link = m.group(1); + String full = resolve(url, link); + + if (full != null && belongsToDomain(full)) { + safeAdd(full); + } + } + + } catch (SocketTimeoutException e) { + log("TIMEOUT", url); + } catch (Exception e) { + log("ERROR", url + " -> " + e.getMessage()); + } + } + }; + + for (int i = 0; i < THREADS; i++) { + executor.submit(worker); + } + + executor.shutdown(); + executor.awaitTermination(30, TimeUnit.MINUTES); + + synchronized (fileLock) { + writer.flush(); + writer.close(); + } + + log("SYSTEM", "DONE. Pages: " + visited.size()); + + System.out.println("\nDONE"); + System.out.println("Pages: " + visited.size()); + System.out.println("Output: sitemap.txt.gz"); + } + + // ---------------- CHUNKED GZ WRITER ---------------- + + private static void writeChunk(String url) { + synchronized (fileLock) { + try { + writer.write(url); + writer.write(" "); + bufferCount++; + + if (bufferCount >= WRITE_CHUNK) { + writer.flush(); + bufferCount = 0; + } + + } catch (IOException e) { + log("WRITE_ERROR", e.getMessage()); + } + } + } + + // ---------------- SAFE QUEUE ---------------- + + private static void safeAdd(String url) { + if (url == null) return; + + url = normalize(url); + + if (!enqueued.add(url)) return; + + if (!visited.contains(url)) { + queue.add(url); + log("QUEUE", url); + } + } + + // ---------------- LOGGING ---------------- + + private static void log(String type, String msg) { + String line = "[" + type + "] " + msg; + System.out.println(line); + + synchronized (logLock) { + try (FileWriter fw = new FileWriter(LOG_FILE, true)) { + fw.write(line + "\n"); + } catch (IOException ignored) {} + } + } + + // ---------------- HELPERS ---------------- + + private static boolean isAsset(String url) { + url = url.toLowerCase(); + for (String ext : SKIP_EXT) { + if (url.endsWith(ext)) return true; + } + return false; + } + + private static String normalize(String url) { + try { + URL u = new URL(url); + return u.getProtocol() + "://" + u.getHost() + u.getPath().replaceAll("/$", ""); + } catch (Exception e) { + return url.split("\\?")[0].split("#")[0].replaceAll("/$", ""); + } + } + + private static boolean belongsToDomain(String url) { + try { + return new URL(url).getHost().endsWith(domain); + } catch (Exception e) { + return false; + } + } + + private static String resolve(String base, String link) { + if (link == null) return null; + + if (link.startsWith("mailto:") || + link.startsWith("tel:") || + link.startsWith("javascript:")) return null; + + try { + return new URL(new URL(base), link).toString().split("#")[0]; + } catch (Exception e) { + return null; + } + } +}