You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

699 lines
23 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
とくてースクリプト 1.0
使い方
最初にPythonを入れる
その次にpip3 install twintを打つ
このファイルを実行する
後は指示に従う
id入力は空白を入れることで複数入力が可能
直接書き込む場合はlist入力と同じように
やってくれること
現在のツイート、それに対するリプライを取得
外部ツイート保存サービス「ツイログ」に3200件までツイートを保存
wayback machineから消されたツイートを取得
メディアにある画像をダウンロード
画像以外のメディアをjsonで出力
注意
CRITICAL:root:twint.feed:Follow:IndexError
↑これが出力されても気にしないでください
エラーが出たら当職にSSか文章と共に送ってください
定期的に改善します
"""
#入力する場合は1
#直接コードに書き込む場合は0
user_check = 1
#パズピー検地のために使う正規表現はここに入れてください
#よくわからない場合はそのままで
#ダブルクォーテーション前のrは消さないでください
seiki = r"学校|中学|高校|特定|炎上|課題|晒|複垢|副垢|アカウント|休み|始業式|部活|部|本名|終業式|リア友|授業|体育祭|文化祭|学園祭|住所|地震|じしん|歳|才|誕生日|風船|ふうせん|揺れ|ゆれ|大会|合唱|部活|学祭|中総体|県体|練習試合|バイト|顔|晒す|さらす|記念日|振替|ふりかえ|学級閉鎖|クラス|年|組|定時制|全日制|科|郵便番号|インスタ|フェイスブック|SNS|賞状|賞|苗字|名前|修学旅行|行事|体育祭|運動会|受験|卒業|塾|考査|テスト|住所|メアド|郵便番号|休校|旅行|習い事|コンビニ|お店|自分語り|自己紹介|入学式|学校祭|学園祭|臨時|休校|バス|バス停|遅延|遅れ|満員|混んでる|空いてる|ガラガラ|コンビニ|セブン|ファミマ|ローソン|ミニストップ|デイリーヤマザキ|デイリー|セイコーマート|セコマ|まいばすけっと|マイバス|ニューデイズ|NewDays|ポプラ|スーパー|ファミレス|サイゼリヤ|サイゼリア|サイゼ|ガスト|イオン|アウトレット|雨|曇り|雲|晴れ|晴|風|台風|警報|注意報|地震|火事|火災|消防車|救急車|事故|ニュース|近所|近く|徒歩|すぐそこ|隣|役所|役場|家|自宅|宅|うち|マンション|アパート|階|エレベーター|部屋|LDK|ポスト|窓|ベランダ|庭|学校|中学|高校|大学|休校|休み|部活|誕生日|バースデー|家族|親|実家|父|父親|親父|パパ|母|母親|おふくろ|ママ|祖父|じいちゃん|祖母|ばあちゃん|親戚|友人|友達|本名|名前|名字|苗字|由来|あだ名|駅|改札|電車|線|快速|普通|各停|各駅|急行|特急|準急|快特|特快|区間|JR|東武|西武|京成|京王|東急|京急|小田急|相鉄|名鉄|近鉄|南海|京阪|阪神|阪急|西鉄|地下鉄|都営|メトロ|市営"
#直接IDを入れる場合はここに
user_s = ["KoushinLawfirm","hakusuiriki"]
#ここからコード
import requests
from bs4 import BeautifulSoup
import json
import ast
import time
import twint
import os
import re
import threading
import queue
list_ = []
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
url_ = "https://web.archive.org/web/"
twilog_url = "http://xn--eckyazdvi.xn--vcki1fxh883oon2c.com"
idtwi_url = "http://gettwitterid.com/?user_name="
flag = "0"
user_number = None
twilog_url_ = "https://xn--eckyazdvi.xn--vcki1fxh883oon2c.com/%E3%83%A6%E3%83%BC%E3%82%B6%E3%83%BC%E8%A9%B3%E7%B4%B0/readmore.php?type=statuses&user_id="
wayback_text = []
wayback_list = queue.Queue()
wayback_list2 = queue.Queue()
image_list = queue.Queue()
image_list2 = queue.Queue()
check_yes = ["Y","y","yes","Yes"]
check_no = ["N","n","no","No"]
def twilog_post(user_):
twilog_data = {
"text": user_,
"type": "user"
}
print('ツイログ通信中')
r = requests.post(twilog_url,data=twilog_data)
print('ツイログ通信終了')
def id_get(user_):
global user_number
print('ID取得中')
try:
r = requests.get(idtwi_url+user_,headers=headers)
soup = BeautifulSoup(r.text,"html5lib")
#user_number = soup.select_one('body > main > article > section:nth-child(2) > div > div > div > dl > dd:nth-child(4) > a')["href"].strip('/')
user_number = soup.select_one('body > div.content > div.info_container > table > tbody > tr:nth-child(1) > td:nth-child(2) > p').text
except:
user_number = 0
print('ID存在せず')
print('ID取得終了')
def tweet_get(user_):
while True:
try:
print('ツイート取得開始')
if user_number == 0:
print('User is not found (tweet_get)')
break
c = twint.Config()
c.Username = user_
c.Output = user_+"/tweet.json"
c.Store_json = True
c.Hide_output = True
twint.run.Search(c)
break
except:
print('ツイート取得 3分停止')
time.sleep(180)
print('ツイート取得終了')
def reply_get(user_):
while True:
try:
print('リプライ取得開始')
if user_number == 0:
print('User is not found (reply_get)')
break
c = twint.Config()
c.Search = user_+"-from:"+user_
c.Output = user_+"/reply.json"
c.Store_json = True
c.Hide_output = True
twint.run.Search(c)
break
except:
print('リプライ取得 3分停止')
time.sleep(180)
print('リプライ取得終了')
def retweet_get(user_):
while True:
try:
print('リツイート取得開始')
if user_number == 0:
print('User is not found (retweet_get)')
break
c = twint.Config()
c.Username = user_
c.Output = user_+"/retweet.json"
c.Store_json = True
c.Hide_output = True
c.Retweets = True
twint.run.Search(c)
break
except:
print('リツイート取得 3分停止')
time.sleep(180)
print('リツイート取得終了')
def photo_get(user_):
while True:
try:
print('画像取得開始')
if user_number == 0:
print('User is not found (photo_get)')
break
c = twint.Config()
c.Username = user_
c.Output = user_+"/photo.json"
c.Store_json = True
c.Hide_output = True
c.Images = True
twint.run.Search(c)
break
except:
print('画像取得開始 3分停止')
time.sleep.sleep(180)
print('画像取得終了')
def follower_get(user_):
while True:
try:
time.sleep(3)
print('フォロワー取得開始')
if user_number == 0:
print('User is not found (follower_get)')
break
c = twint.Config()
c.Username = user_
c.Output = user_+"/follower.json"
c.Store_json = True
c.Hide_output = True
c.User_full = True
twint.run.Followers(c)
break
except:
print('フォロワー取得 3分停止')
time.sleep(180)
print('フォロワー取得終了')
def follow_get(user_):
while True:
try:
time.sleep(3)
print('フォロー取得開始')
if user_number == 0:
print('User is not found (Follow_get)')
break
c = twint.Config()
c.Username = user_
c.Output = user_+"/follow.json"
c.Store_json = True
c.Hide_output = True
c.User_full = True
twint.run.Following(c)
break
except:
print('フォロー取得 3分停止')
time.sleep(180)
print('フォロー取得終了')
def FF_get(user_):
t6.join()
t7.join()
print('FF start')
if user_number == 0:
print('User is not found(FF)')
return
try:
r = open(user_+'/follow.json','r',encoding='utf-8')
follow_ = [json.loads(i) for i in r.readlines()]
r.close()
r = open(user_+'/follower.json','r',encoding='utf-8')
follower_ = [json.loads(i) for i in r.readlines()]
r.close()
ff_ = []
for i in follower_:
if i in follow_:
ff_.append(i)
r = open(user_+'/FF.json','w',encoding='utf-8')
for i in ff_:
r.write(str(i)+'\n')
r.close()
except:
pass
print('FF end')
def wayback_get(user_):
global wayback_text
t3.join()
t5.join()
url2 = "https://web.archive.org/web/timemap/?url=https%3A%2F%2Ftwitter.com%2F"+user_+"%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Cmimetype%2Ctimestamp%2Cendtimestamp%2Cgroupcount%2Cuniqcount&filter=!statuscode%3A%5B45%5D..&limit=100000&_=1588002165678"
print('wayback machine start')
#print(url2)
while True:
try:
r = requests.get(url2,headers=headers).text
except:
print('error')
time.sleep(2)
continue
break
#print(user_)
time.sleep(2)
list2 = ast.literal_eval(r)
list2.pop(0)
list_.extend(list2)
json_all = []
if os.path.exists(user_+"/tweet.json"):
r = open(user_+"/tweet.json",'r',encoding='utf-8')
json_all.extend([json.loads(i.replace('\\U','')) for i in r.readlines() if i != '\n'])
r.close()
if os.path.exists(user_+"/retweet.json"):
r = open(user_+"/retweet.json",'r',encoding='utf-8')
json_all.extend([json.loads(i.replace('\\U','')) for i in r.readlines() if i != '\n'])
r.close()
json_all = [str(i["id"]) for i in json_all]
file_name = user_+"/web_archive.txt"
file = open(file_name,'a',encoding='utf-8')
for i in list_:
if str(i[0].split('/')[-1]) in json_all:
#print('既に取得済み')
continue
wayback_list.put(i)
while not wayback_list.empty():
#ここからTweet deleted check
th = threading.Thread(target=tweet_check, args=(wayback_list,))
th.start()
#r = requests.get(i[0])
#soup1 = BeautifulSoup(r.text,"html5lib")
#if "「いいね」しました" in soup1.text:
# print(url+"\nThis is available")
# continue
#else:
# print(url+"\nDeleted")
#利点:ツイート数が減って見やすくなる(rtとかが減る)
#欠点:通常ツイートで取得できてないのも逃がしてしまう
while not wayback_list2.empty() and not wayback_list.empty():
i = wayback_list2.get(block=True)
url = url_ + i[2] + '/'+ i[0]
cc = 0
while True:
try:
if cc == 5:
break
r = requests.get(url,headers=headers)
except:
print('失敗:'+str(cc))
cc += 1
continue
break
if cc == 5:
file.write(url+' ――――――――――――――――失敗\n')
print(url+'――――――――――――――――失敗')
continue
if r.history != []:
print(url+'リダイレクト=RT')
continue
print(i)
soup1 = BeautifulSoup(r.text,"html5lib")
time.sleep(1)
if "The Wayback Machine has not archived that URL." in soup1.text:
tweet_text = '削除済み、保存なし'
json_c = 'Deleted'
else:
if i[1] == 'application/json':
json_c = "json"
json_ = json.loads(soup1.text)
tweet_text = json_["text"]
if tweet_text.startswith('RT'):
print('RTのため飛ばし')
continue
else:
json_c = "Text"
tweet_text = soup1.find(class_='TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text')
if tweet_text == None:
tweet_text = '削除済み、保存なし'
json_c = 'Deleted'
else:
tweet_text = tweet_text.text
print(url)
file.write("{\"id\": "+i[0].split('/')[-1]+", \"mode\": \""+json_c+"\", \"url\": \""+url+"\", \"tweet\": \""+tweet_text.replace("\\","\\\\")+'\"}\n')
wayback_text.append("{\"id\": "+i[0].split('/')[-1]+", \"mode\": \""+json_c+"\", \"url\": \""+url+"\", \"tweet\": \""+tweet_text.replace("\\","\\\\")+'\"}\n')
file.close()
print('wayback machine end')
def twilog_get(user_):
global user_number
global flag
t1.join()
t2.join()
t3.join()
print('ツイログ start')
if user_number == 0:
print('ツイログはパスされました')
pass
else:
r = open(user_+'/tweet.json','r',encoding='utf-8')
text_ = r.readlines()
r.close()
text_.pop(-1)
date = json.loads(text_[-1])["date"].split("-")
year_ = date[0]
month_ = date[1]
twilog_url__ = twilog_url_ + str(user_number)+"&date="
while True:
data = year_ + month_
json_ = []
while True:
_url = twilog_url__ + data+"&flag="+flag
r = requests.get(_url,headers=headers)
r_ = json.loads(r.text)
try:
flag = r_["data"][-1]["id"]
except:
p = open(user_+'/twilog.json','a',encoding='utf-8')
for i in json_:
p.write(str(i)+'\n')
p.close()
print('終了')
break
print(flag)
json_.extend(r_["data"])
if json_ == []:
break
month_ = "0"+str(int(month_) -1)
if month_ == "00":
year_ = str(int(year_) -1)
print('ツイログ end')
def download_file(image_list, user_):
i = image_list.get()
dst_path = user_+"/photo/"+i.split('/')[-1]
r = requests.get(i)
file_ = open(dst_path,'wb')
file_.write(r.content)
file_.close()
def image_check(user_):
t11.join()
print('画像検査開始')
photo_url = []
if os.path.exists(user_+"/photo.json"):
r = open(user_+"/photo.json",'r',encoding='utf-8')
photo_ = [json.loads(i)["id"] for i in r.readlines()]
r = open(user_+"/photo.json",'r',encoding='utf-8')
for i in r.readlines():
photo_url.extend(json.loads(i)["photos"])
r.close()
else:
print('photo is not Found')
os.makedirs(user_+"/photo", exist_ok=True)
for i in photo_url:
image_list.put(i)
while not image_list.empty():
#ここからTweet deleted check
th2 = threading.Thread(target=download_file, args=(image_list,user_))
th2.start()
print('画像検査終了')
def pazzle(user_):
global wayback_text
t3.join()
t4.join()
t9.join()
t10.join()
t12.join()
print('パズルのピースが埋まっていく')
tweets = []
if os.path.exists(user_+"/tweet.json"):
r = open(user_+"/tweet.json",'r',encoding='utf-8')
tweets.extend(r.readlines())
r.close()
else:
print('Tweet is not Found')
if os.path.exists(user_+"/reply.json"):
r = open(user_+"/reply.json",'r',encoding='utf-8')
tweets.extend(r.readlines())
r.close()
else:
print('Reply is not Found')
if os.path.exists(user_+"/web_archive.txt"):
tweets.extend(wayback_text)
else:
print('Web_Archive is not Found')
if os.path.exists(user_+'/twilog.json'):
r = open(user_+'/twilog.json','r',encoding='utf-8')
tweets.extend(r.readlines())
r.close()
else:
print('Twilog is not Found')
tweets = [i for i in tweets if i != '\n']
r = open(user_+'/pazzle_piece.json','w',encoding='utf-8')
for i in tweets:
m = re.search(seiki, i)
if m:
r.write(i)
r.close()
r = open(user_+'/media.json','w',encoding='utf-8')
if os.path.exists(user_+"/photo.json"):
r2 = open(user_+"/photo.json",'r',encoding='utf-8')
photo_ = [str(json.loads(i)["id"]) for i in r2.readlines()]
r2.close()
else:
print('photo is not Found')
photo_ = []
ll = '|'.join(photo_)
for i in tweets:
if "pic" in i:
m = re.search(ll,i)
if not m:
r.write(i)
r.close()
print(user_+": Ended")
def empty_():
pass
def tweet_check(wayback_list):
i = wayback_list.get(block=True)
r = requests.get(i[0])
if r.status_code == 200:
pass
else:
wayback_list2.put(i)
wayback_list.task_done()
if __name__ == '__main__':
if user_check == 1:
while True:
inp = input('ユーザーIDを入れてください')
if inp == "":
continue
user_s = inp.split()
break
elif user_check == 0:
pass
else:
print('0か1を入れてください')
exit()
#Hasyagu_Xd k_oorer_tofu minsaikou
#hi81019
all_check = input('オート(全部オン) Y/N')
for user_ in user_s:
print("現在特定中:"+user_)
print('フォルダ作成中')
os.makedirs(user_, exist_ok=True)
print('フォルダ作成終了')
while True:
if all_check in check_yes:
t1 = threading.Thread(target=twilog_post,args=(user_,))
t2 = threading.Thread(target=id_get,args=(user_,))
t3 = threading.Thread(target=tweet_get,args=(user_,))
t4 = threading.Thread(target=reply_get,args=(user_,))
t6 = threading.Thread(target=follow_get,args=(user_,))
t7 = threading.Thread(target=follower_get,args=(user_,))
t8 = threading.Thread(target=FF_get,args=(user_,))
t5 = threading.Thread(target=retweet_get,args=(user_,))
t9 = threading.Thread(target=wayback_get,args=(user_,))
t10 = threading.Thread(target=twilog_get,args=(user_,))
t11 = threading.Thread(target=photo_get,args=(user_,))
t12 = threading.Thread(target=image_check,args=(user_,))
t13 = threading.Thread(target=pazzle,args=(user_,))
break
elif all_check in check_no:
while True:
twilog_check = input("ツイログを使うかどうか Y/N")
if twilog_check in check_yes:
t1 = threading.Thread(target=twilog_post,args=(user_,))
t2 = threading.Thread(target=id_get,args=(user_,))
t10 = threading.Thread(target=twilog_get,args=(user_,))
break
elif twilog_check in check_no:
t1 = threading.Thread(target=empty_)
t2 = threading.Thread(target=empty_)
t10 = threading.Thread(target=empty_)
break
while True:
twilog_check = input("ツイートを収集するかどうか Y/N")
if twilog_check in check_yes:
t3 = threading.Thread(target=tweet_get,args=(user_,))
t4 = threading.Thread(target=reply_get,args=(user_,))
break
elif twilog_check in check_no:
t3 = threading.Thread(target=empty_)
t4 = threading.Thread(target=empty_)
break
while True:
twilog_check = input("FFを収集するかどうか Y/N")
if twilog_check in check_yes:
t6 = threading.Thread(target=follow_get,args=(user_,))
t7 = threading.Thread(target=follower_get,args=(user_,))
t8 = threading.Thread(target=FF_get,args=(user_,))
break
elif twilog_check in check_no:
t6 = threading.Thread(target=empty_)
t7 = threading.Thread(target=empty_)
t8 = threading.Thread(target=empty_)
break
while True:
twilog_check = input("wayback machineから収集するかどうか Y/N")
if twilog_check in check_yes:
t5 = threading.Thread(target=retweet_get,args=(user_,))
t9 = threading.Thread(target=wayback_get,args=(user_,))
break
elif twilog_check in check_no:
t5 = threading.Thread(target=empty_)
t9 = threading.Thread(target=empty_)
break
while True:
twilog_check = input("画像をダウンロードするかどうか Y/N")
if twilog_check in check_yes:
t11 = threading.Thread(target=photo_get,args=(user_,))
t12 = threading.Thread(target=image_check,args=(user_,))
break
elif twilog_check in check_no:
t11 = threading.Thread(target=empty_)
t12 = threading.Thread(target=empty_)
break
while True:
twilog_check = input("パズピーを出力するかどうか Y/N")
if twilog_check in check_yes:
t13 = threading.Thread(target=pazzle,args=(user_,))
break
elif twilog_check in check_no:
t13 = threading.Thread(target=empty_)
break
break
t1.start()
t2.start()
t3.start()
t4.start()
t5.start()
t6.start()
t7.start()
t8.start()
t9.start()
t10.start()
t11.start()
t12.start()
t13.start()
t11.join()
t13.join()