

[Bitcoin Preprocessing 4] Computing a word positivity index from price increases and decreases

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
 
 
 
 
# Load merge_temp_bit(complete) as train
import pandas as pd
 
train = pd.read_csv('/content/drive/My Drive/BIGCOIN/PREPORCOESSING/트레인데이터/merge_temp_bit(complete).csv')
 
 
 
 
# The index numbers in train are not in order, so rebuild a sequential index
 
index = [i for i in range(len(train))]
train = train[["text", "gap_weighted_price"]]
train['index'] = index
train = train.reset_index()
 
 
 
 
 
# KaggleWord2VecUtility: text-cleaning utility (defined inline here rather than imported)
 
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')  # required because stopwords.words('english') is used below
import pandas as pd
import numpy as np
 
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
 
from multiprocessing import Pool
 
class KaggleWord2VecUtility(object):
 
    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # 1. Strip HTML
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Replace anything that is not a letter with a space
        review_text = re.sub('[^a-zA-Z]', ' ', review_text)
        # 3. Lowercase and split into words
        words = review_text.lower().split()
        # 4. Optionally remove stop words
        if remove_stopwords:
            stops = set(stopwords.words('english'))
            words = [w for w in words if not w in stops]
        # 5. Stem each word
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]
        # 6. Return the list of words
        return(words)
 
    @staticmethod
    def review_to_join_words( review, remove_stopwords=False ):
        # Join the cleaned word list back into a single space-separated string
        words = KaggleWord2VecUtility.review_to_wordlist(\
            review, remove_stopwords=remove_stopwords)
        join_words = ' '.join(words)
        return join_words
 
    @staticmethod
    def review_to_sentences( review, remove_stopwords=False ):
        # punkt tokenizer를 로드한다.
        """
        이 때, pickle을 사용하는데
        pickle을 통해 값을 저장하면 원래 변수에 연결 된 참조값 역시 저장된다.
        저장된 pickle을 다시 읽으면 변수에 연결되었던
        모든 레퍼런스가 계속 참조 상태를 유지한다.
        """
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # 1. nltk tokenizer를 사용해서 단어로 토큰화 하고 공백 등을 제거한다.
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. 각 문장을 순회한다.
        sentences = []
        for raw_sentence in raw_sentences:
            # 비어있다면 skip
            if len(raw_sentence) > 0:
                # 태그제거, 알파벳문자가 아닌 것은 공백으로 치환, 불용어제거
                sentences.append(\
                    KaggleWord2VecUtility.review_to_wordlist(\
                    raw_sentence, remove_stopwords))
        return sentences
 
 
    # Run the work across multiple processes to speed things up
    @staticmethod
    def _apply_df(args):
        df, func, kwargs = args
        return df.apply(func, **kwargs)
 
    @staticmethod
    def apply_by_multiprocessing(df, func, **kwargs):
        # Pop the workers parameter out of the keyword arguments
        workers = kwargs.pop('workers')
        # Create a process pool with that many workers
        pool = Pool(processes=workers)
        # Split the dataframe into one chunk per worker and apply the function to each chunk
        result = pool.map(KaggleWord2VecUtility._apply_df,
                          [(d, func, kwargs)
                           for d in np.array_split(df, workers)])
        pool.close()
        # Concatenate the partial results and return them
        return pd.concat(result)
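
# Quick sanity check (illustrative example, not from the original run) of what
# review_to_wordlist produces: HTML stripped, non-alphabetic characters removed,
# text lowercased, stop words dropped, words stemmed.
sample = "Bitcoin prices are <b>surging</b> again!"
print(KaggleWord2VecUtility.review_to_wordlist(sample, remove_stopwords=True))
# expected output, roughly: ['bitcoin', 'price', 'surg']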
 
 
 
# Multiprocessing helpers (module-level versions of the methods above)
from multiprocessing import Pool
import numpy as np
 
def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)
 
def apply_by_multiprocessing(df, func, **kwargs):
    # Pop the workers parameter out of the keyword arguments
    workers = kwargs.pop('workers')
    # Create a process pool with that many workers
    pool = Pool(processes=workers)
    # Split the dataframe into one chunk per worker and apply the function to each chunk
    result = pool.map(_apply_df,
                      [(d, func, kwargs) for d in np.array_split(df, workers)])
    pool.close()
    # Concatenate the partial results and return them
    return pd.concat(list(result))
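
# The DTM step further below consumes clean_train_reviews, which is not defined in
# the snippet as pasted. A hedged reconstruction of that missing cleaning step (the
# worker count is an assumption) would apply review_to_join_words to every tweet
# text in parallel:
clean_train_reviews = apply_by_multiprocessing(
    train['text'],
    KaggleWord2VecUtility.review_to_join_words,
    workers=4)  # adjust the worker count to the machine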
 
 
 
# Configure CountVectorizer; min_df is set to 150 because the data would otherwise grow too large
 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
 
# Parameter values were changed from the tutorial defaults;
# tuning these parameters alone changes the Kaggle score noticeably
vectorizer = CountVectorizer(analyzer = 'word',
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             min_df = 150,  # minimum number of documents a token must appear in
                             ngram_range = (1, 1),
                             max_features = 50000
                            )
vectorizer
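
# Toy illustration (not from the original data) of what min_df does:
# tokens appearing in fewer than min_df documents are dropped from the vocabulary.
toy_cv = CountVectorizer(min_df=2)
toy_cv.fit(["btc moon soon", "btc crash", "hodl forever"])
print(toy_cv.vocabulary_)   # only 'btc' appears in >= 2 documents: {'btc': 0}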
 
 
# Since only a single step is wrapped here, the pipeline may be unnecessary;
# Pipeline is used to chain several feature-engineering steps together.
pipeline = Pipeline([
    ('vect', vectorizer),
])
 
 
 
 
 
 
# Build the document-term matrix (DTM) from the cleaned texts
%time train_data_features = pipeline.fit_transform(clean_train_reviews)
train_data_features
 
 
 
# Save the feature (token) names
features = vectorizer.get_feature_names()
 
 
 
# Convert the DTM to a DataFrame
dtm_df = pd.DataFrame(data=np.asarray(train_data_features.toarray()),  # values
                      columns=features)                                # token names as columns
 
 
 
 
# Using the sign of gap_weighted_price in train, take the 'index' values of the negative
# and positive rows as lists, slice dtm_df with them, then sum each feature within each group
 
dtm_minus_sum = dtm_df.iloc[train.loc[train['gap_weighted_price'] < 0, 'index'].tolist()].sum()
dtm_plus_sum = dtm_df.iloc[train.loc[train['gap_weighted_price'] > 0, 'index'].tolist()].sum()
 
 
 
# Compute btc_index, the per-word positivity score
btc_index = (dtm_plus_sum - dtm_minus_sum) / (dtm_minus_sum + dtm_plus_sum)
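
# Worked example with hypothetical counts (not from the actual DTM): a word seen
# 30 times in price-up rows and 10 times in price-down rows scores
# (30 - 10) / (30 + 10) = 0.5, so btc_index always lies in [-1, 1];
# +1 means the word only ever co-occurs with price increases, -1 only with drops.
plus, minus = 30, 10
print((plus - minus) / (plus + minus))   # 0.5 -> leans positive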
 
 
# Check how many words were scored
len(btc_index)
 
 
 
# Save the results with pickle
import pickle
 
## Save pickle
with open("dtm_minus_sum.pickle", "wb") as fw:
    pickle.dump(dtm_minus_sum, fw)
with open("dtm_plus_sum.pickle", "wb") as fa:
    pickle.dump(dtm_plus_sum, fa)
with open("btc_index.pickle", "wb") as fb:
    pickle.dump(btc_index, fb)
 
 
 
 
## Load pickle
'''with open("data.pickle","rb") as fr:
    data = pickle.load(fr)
print(data)
'''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

** What if the index were computed from the actual size of the price change? Wouldn't that give a more dynamic score?

Even on days the price rises, the same word A could appear when the price rose by 3 or when it rose by 5... what if that difference were taken into account as well?
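
If we wanted to try that, a rough sketch (my own extension, not part of the pipeline above) could weight each tweet's word counts by the actual gap_weighted_price rather than only by its sign:

# Hypothetical magnitude-weighted variant of btc_index (an assumption, not the
# original method): a word seen during a +5 move now counts more than one seen
# during a +3 move, while the score stays bounded in [-1, 1].
w = train['gap_weighted_price'].values            # rows of dtm_df align with train
signed_sum = dtm_df.mul(w, axis=0).sum()          # magnitude-weighted, keeps the sign
abs_sum = dtm_df.mul(np.abs(w), axis=0).sum()     # total weighted occurrences per word
btc_index_weighted = signed_sum / abs_sum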