본문 바로가기

2019년 혁신성장 청년인재 집중양성(빅데이터)/집중양성과정 프로젝트 01

[비트코인 전처리2] 트윗 데이터에 가격 태그 붙이기

 

 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
'''Adding price features to tweet data'''
 
# Mount Google Drive so the CSV files under /content/drive are reachable.

from google.colab import drive
drive.mount('/content/drive')
 
 
# 라이브러리 임포트
 
import pandas as pd
import numpy as np
from tqdm import tqdm 
 
# Load the input tables. `temp` is the "100 selected" tweet subset; the other
# two are 1-minute bitcoin price histories from bitstamp and coinbase.
TWEET_CSV_PATH = "/content/drive/My Drive/BIGCOIN/PREPORCOESSING/트레인데이터/temp_100_selected.csv"
BITSTAMP_CSV_PATH = "/content/drive/My Drive/BIGCOIN/PREPORCOESSING/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2019-08-12.csv"
COINBASE_CSV_PATH = "/content/drive/My Drive/BIGCOIN/PREPORCOESSING/bitcoin-historical-data/coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.csv"

temp = pd.read_csv(TWEET_CSV_PATH)
bitstamp = pd.read_csv(BITSTAMP_CSV_PATH)
coinbase = pd.read_csv(COINBASE_CSV_PATH)
 
 
# Clean both price tables the same way: drop rows with missing values, drop
# exact duplicate rows, then renumber the index from 0.
def _drop_missing_and_duplicates(frame):
    """Return `frame` without NaN rows or duplicate rows, with a fresh index."""
    return frame.dropna(axis=0).drop_duplicates().reset_index(drop=True)


bitstamp_reset = _drop_missing_and_duplicates(bitstamp)
coinbase_reset = _drop_missing_and_duplicates(coinbase)
 
 
# Keep only the tweet columns that are needed, and rename the price tables'
# timestamp column so both sides share the join key `hour_stamp`.
# The column list needs the comma: adjacent string literals concatenate, so
# "text""hour_stamp" would ask for a single column named "texthour_stamp".
temp = temp[["text", "hour_stamp"]]
bitstamp_reset = bitstamp_reset.rename(columns={'Timestamp': 'hour_stamp'})
coinbase_reset = coinbase_reset.rename(columns={'Timestamp': 'hour_stamp'})

# Left joins keep every tweet row; a tweet with no matching price row gets
# NaN in the price columns.
merge_temp_bit = pd.merge(temp, bitstamp_reset, how='left', on='hour_stamp')
merge_temp_co = pd.merge(temp, coinbase_reset, how='left', on='hour_stamp')

# The bitstamp join has fewer missing values than the coinbase join, so the
# bitstamp data is the one used from here on.
 
 
 
# Split out the join key plus the original bitstamp price/volume columns.
# Every name is its own list element; without the comma the last two literals
# would concatenate into one non-existent column name.
bitstamp_set = merge_temp_bit[[
    "hour_stamp",
    "Open",
    "High",
    "Low",
    'Close',
    'Volume_(BTC)',
    'Volume_(Currency)',
    'Weighted_Price',
]]

# Remove duplicate rows (e.g. several tweets joined to the same price row)
# and renumber the index so positional row-to-row comparisons work.
bitstamp_set = bitstamp_set.drop_duplicates()
bitstamp_set = bitstamp_set.reset_index(drop=True)
 
 
 
 
 
# Compute row-to-row differences and add them as gap_* columns.
#
# Series.diff() yields value[i] - value[i-1] for every row and keeps the
# frame's length. The first row has no predecessor, so its gap is NaN; a list
# built over range(1, len(bitstamp_set)) would be one element short and could
# not be assigned as a column.
# NOTE(review): gaps follow the current row order of bitstamp_set - confirm
# the rows are in chronological order before relying on them.
gap_Open = bitstamp_set["Open"].diff()
gap_High = bitstamp_set["High"].diff()
gap_Low = bitstamp_set["Low"].diff()
gap_vol_BTC = bitstamp_set["Volume_(BTC)"].diff()
gap_vol_Currency = bitstamp_set["Volume_(Currency)"].diff()
gap_weighted_price = bitstamp_set["Weighted_Price"].diff()

# Attach the differences to the bitstamp_set data frame.
bitstamp_set["gap_Open"] = gap_Open
bitstamp_set["gap_High"] = gap_High
bitstamp_set["gap_Low"] = gap_Low
bitstamp_set["gap_vol_BTC"] = gap_vol_BTC
bitstamp_set["gap_vol_Currency"] = gap_vol_Currency
bitstamp_set["gap_weighted_price"] = gap_weighted_price
 
 
 
# Derive the direction of every gap_* column and add it as a sign_* column:
# -1 for a decrease, 1 for an increase, 0 for no change.
def _direction(gap_column):
    """Return the sign (-1, 0 or 1) of each value in bitstamp_set[gap_column].

    A NaN gap is neither below nor above zero, so it maps to 0, the same
    result an element-wise `-1 if x < 0 else 1 if x > 0 else 0` gives.
    """
    return np.sign(bitstamp_set[gap_column]).fillna(0).astype(int)


# Direction of the Open / High / Low price changes.
sign_gap_Open = _direction("gap_Open")
sign_gap_High = _direction("gap_High")
sign_gap_Low = _direction("gap_Low")

# Direction of the volume and weighted-price changes. These read the gap_*
# columns: the raw Volume_(BTC) and Weighted_Price values are never negative,
# so their own sign would not describe a change.
sign_vol_BTC = _direction("gap_vol_BTC")
sign_vol_currency = _direction("gap_vol_Currency")
sign_vol_weighted_price = _direction("gap_weighted_price")

# Attach the directions to the bitstamp_set data frame.
bitstamp_set["sign_gap_Open"] = sign_gap_Open
bitstamp_set["sign_gap_High"] = sign_gap_High
bitstamp_set["sign_gap_Low"] = sign_gap_Low
bitstamp_set["sign_vol_BTC"] = sign_vol_BTC
bitstamp_set["sign_vol_currency"] = sign_vol_currency
bitstamp_set["sign_vol_weighted_price"] = sign_vol_weighted_price
 
# Join the enriched price rows back onto the tweets, discard every row that
# still contains a missing value, and renumber the index from 0.
merge_temp_bit = (
    temp.merge(bitstamp_set, how='left', on='hour_stamp')
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Save the tagged tweet data to the working directory.
merge_temp_bit.to_csv("merge_temp_bit(complete).csv")
 
 
 
 
 
http://colorscripter.com/info#e" target="_blank" style="color:#e5e5e5;text-decoration:none">Colored by Color Scripter
http://colorscripter.com/info#e" target="_blank" style="text-decoration:none;color:white">cs