
2019 Innovative Growth Youth Talent Intensive Training (Big Data) / Intensive Training Course Project 01

[Bitcoin Preprocessing 3] Extracting the Top 50,000 Words by Cosine Similarity to 'price'

''' Calculate cosine similarity for "price" '''
 
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
################################################
 
# Suppress warnings so the output does not get too long;
# when actually training, it is recommended to comment out the two lines below.
import warnings
warnings.filterwarnings('ignore')
################################################
 
# Load merge_temp_bit(complete).csv, completed in part 2, as the train data
# and drop missing values
import pandas as pd

train = pd.read_csv('/content/drive/My Drive/BIGCOIN/PREPORCOESSING/트레인데이터/merge_temp_bit(complete).csv')
train = train.dropna()
 
################################################
# Define KaggleWord2VecUtility for text preprocessing
import re
import nltk
nltk.download('punkt')
import pandas as pd
import numpy as np
 
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
 
from multiprocessing import Pool
 
class KaggleWord2VecUtility(object):
 
    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # 1. Remove HTML
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Replace non-alphabetic characters with spaces
        review_text = re.sub('[^a-zA-Z]', ' ', review_text)
        # 3. Lowercase and split into words
        words = review_text.lower().split()
        # 4. Remove stopwords
        if remove_stopwords:
            stops = set(stopwords.words('english'))
            words = [w for w in words if not w in stops]
        # 5. Stemming
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]
        # 6. Return as a list
        return words
 
    @staticmethod
    def review_to_join_words( review, remove_stopwords=False ):
        words = KaggleWord2VecUtility.review_to_wordlist(
            review, remove_stopwords=remove_stopwords)
        join_words = ' '.join(words)
        return join_words
 
    @staticmethod
    def review_to_sentences( review, remove_stopwords=False ):
        # punkt tokenizer를 로드한다.
        """
        이 때, pickle을 사용하는데
        pickle을 통해 값을 저장하면 원래 변수에 연결 된 참조값 역시 저장된다.
        저장된 pickle을 다시 읽으면 변수에 연결되었던
        모든 레퍼런스가 계속 참조 상태를 유지한다.
        """
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # 1. nltk tokenizer를 사용해서 단어로 토큰화 하고 공백 등을 제거한다.
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. 각 문장을 순회한다.
        sentences = []
        for raw_sentence in raw_sentences:
            # 비어있다면 skip
            if len(raw_sentence) > 0:
                # 태그제거, 알파벳문자가 아닌 것은 공백으로 치환, 불용어제거
                sentences.append(\
                    KaggleWord2VecUtility.review_to_wordlist(\
                    raw_sentence, remove_stopwords))
        return sentences
 
 
    # Use multiple processes to speed up the work
    @staticmethod
    def _apply_df(args):
        df, func, kwargs = args
        return df.apply(func, **kwargs)
 
    @staticmethod
    def apply_by_multiprocessing(df, func, **kwargs):
        # Pop the 'workers' parameter from the keyword arguments
        workers = kwargs.pop('workers')
        # Create a process pool with that many workers
        pool = Pool(processes=workers)
        # Split the data into one chunk per worker and apply the function
        result = pool.map(KaggleWord2VecUtility._apply_df,
                          [(d, func, kwargs)
                           for d in np.array_split(df, workers)])
        pool.close()

        # Concatenate the partial results and return
        return pd.concat(result)
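
################################################
# (Added sketch, not in the original post) The multiprocessing helper above
# is never exercised directly in this script; a minimal usage example,
# assuming the raw tweets live in the 'text' column used below:
clean_text = KaggleWord2VecUtility.apply_by_multiprocessing(
    train['text'], KaggleWord2VecUtility.review_to_join_words, workers=4)
clean_text.head()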
 
################################################
# Count rows grouped by hour_stamp and inspect the values
train.groupby(train["hour_stamp"]).count().head()
 
################################################
 
# Check that the preprocessing works
KaggleWord2VecUtility.review_to_wordlist(train['text'][0])[:10]
 
################################################
# Preprocess each tweet and append it to the sentences list as per-sentence word lists
sentences = []
for review in train["text"]:
    sentences += KaggleWord2VecUtility.review_to_sentences(
        review, remove_stopwords=False)
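
# (Added) Quick sanity check: how many sentences were parsed, and the
# first ten tokens of the first sentence.
print(len(sentences))
print(sentences[0][:10])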
 
################################################
# Logging
 
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)
 
 
#################################################
# Set the hyperparameters
num_features = 300    # word vector dimensionality
min_word_count = 100  # minimum word frequency
num_workers = 4       # number of parallel worker threads
context = 15          # context window size
downsampling = 1e-3   # downsampling threshold for frequent words
 
# Initialize and train the model
from gensim.models import word2vec

# Train the model (gensim 3.x API: size/iter)
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling, iter=15)
model
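
# (Added) Sanity check: similar_by_vector below asks for the top 50,000
# words, so the vocabulary needs at least that many entries at min_count=100.
print(len(model.wv.vocab))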
 
 
 
 
# Once training is done, unload the memory that is no longer needed.
model.init_sims(replace=True)

model_name = '1st_model_for_cosine'
# Save the trained model; the same file is loaded again below.
model.save('/content/drive/My Drive/BIGCOIN/PREPORCOESSING/트레인데이터/' + model_name + '.model')
 
# Check similarity
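# (Added sketch) The original leaves this step blank; one plausible check is
# to list the nearest neighbours of 'price' in the trained embedding:
print(model.wv.most_similar('price', topn=10))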
 
 
 
# Load the model
from gensim.models import Word2Vec
model = Word2Vec.load("/content/drive/My Drive/BIGCOIN/PREPORCOESSING/트레인데이터/1st_model_for_cosine.model")
 
# Use similar_by_vector to select the 50,000 words most similar to 'price'
# and assign the result to model_50000
model_50000 = model.wv.similar_by_vector(model.wv['price'], topn=50000, restrict_vocab=None)
 
## Save with pickle
import pickle

with open("model_50000.pickle", "wb") as fw:
    pickle.dump(model_50000, fw)
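
# (Added) Round-trip check: read the pickle back and inspect the first
# few (word, cosine similarity) pairs.
with open("model_50000.pickle", "rb") as fr:
    loaded = pickle.load(fr)
print(loaded[:5])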
 
 
 