Use dos2unix instead of sed
Add a sed workaround for matching newlines (see https://stackoverflow.com/a/1252191)
This commit is contained in:
		
							parent
							
								
									b76378f607
								
							
						
					
					
						commit
						88d6447fe0
					
				|  | @ -15,15 +15,17 @@ wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv | |||
| 
 | ||||
| cat ../src/URLhaus.csv | \ | ||||
| # Convert DOS to Unix line ending | ||||
| sed -z -e 's/\r\n/\n/g' | \ | ||||
| dos2unix | \ | ||||
| # Parse online URLs only | ||||
| grep '"online"' | \ | ||||
| # Parse domains and IP address only | ||||
| cut -f 6 -d '"' | \ | ||||
| cut -f 3 -d '/' | \ | ||||
| cut -f 1 -d ':' | \ | ||||
| # Remove www. | ||||
| sed -z -e 's/\nwww\./\n/g' | \ | ||||
| # Remove the leading "www." | ||||
| # Only matches domains that start with "www.", | ||||
| # not ones that merely contain it (e.g. examplewww.com) | ||||
| sed ':a;N;$!ba;s/\nwww\./\n/g' | \ | ||||
| # Sort and remove duplicates | ||||
| sort -u | \ | ||||
| # Exclude domains in the Umbrella Top 1M list (grep inverse match on whole lines) | ||||
|  |  | |||
|  | @ -10,11 +10,13 @@ wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m | |||
| # Decompress the zip and write output to stdout | ||||
| unzip -p top-1m.csv.zip | \ | ||||
| # Convert DOS to Unix line ending | ||||
| sed -z -e 's/\r\n/\n/g' | \ | ||||
| dos2unix | \ | ||||
| # Parse domains only | ||||
| cut -f 2 -d ',' | \ | ||||
| # Remove www. | ||||
| sed -z -e 's/\nwww\./\n/g' | \ | ||||
| # Remove the leading "www." | ||||
| # Only matches domains that start with "www.", | ||||
| # not ones that merely contain it (e.g. examplewww.com) | ||||
| sed ':a;N;$!ba;s/\nwww\./\n/g' | \ | ||||
| # Remove duplicates | ||||
| sort -u > ../src/top-1m.txt | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue