2009-01-16 00:05:55 +01:00
require 'rubygems'
require 'couchrest'
2008-05-25 20:49:37 +02:00
2008-12-14 12:05:02 +01:00
# Connect to the local CouchDB server and start every run from an empty
# 'word-count-example' database.
couch = CouchRest.new('http://127.0.0.1:5984')
db = couch.database('word-count-example')

# Best-effort cleanup: any StandardError raised by the delete is ignored
# (presumably the "database does not exist" case on a first run).
begin
  db.delete!
rescue StandardError
  nil
end

db = couch.create_db('word-count-example')
2008-05-25 20:49:37 +02:00
2008-06-20 22:36:36 +02:00
# Project Gutenberg texts used as sample input, keyed by the local file name
# each one is stored under (in the same directory as this script).
books = {
  'outline-of-science.txt' => 'http://www.gutenberg.org/files/20417/20417.txt',
  'ulysses.txt' => 'http://www.gutenberg.org/dirs/etext03/ulyss12.txt',
  'america.txt' => 'http://www.gutenberg.org/files/16960/16960.txt',
  'da-vinci.txt' => 'http://www.gutenberg.org/dirs/etext04/7ldv110.txt'
}

# Download every book that is not already on disk.
books.each do |file, url|
  pathfile = File.join(File.dirname(__FILE__), file)
  next if File.exist?(pathfile)

  # The argument-list form of `system` runs curl without a shell, so paths
  # containing spaces or shell metacharacters are passed through untouched.
  # `:out` sends curl's stdout to the target file, like a `>` redirect.
  warn "Download of #{url} failed" unless system('curl', url, :out => pathfile)
end
# Number of text lines stored in each CouchDB document.
lines_per_chunk = 11

# Split every downloaded book into consecutive chunks of lines and save each
# chunk as its own document, tagged with the book title and a zero-based
# chunk index. Progress (title, then running chunk count) is printed to stdout.
books.keys.each do |book|
  title = book.split('.').first
  puts title

  File.open(File.join(File.dirname(__FILE__), book), 'r') do |file|
    # each_slice also yields the final, shorter group of lines, so the tail
    # of a book whose line count is not a multiple of the chunk size is
    # stored instead of being discarded.
    file.each_slice(lines_per_chunk).each_with_index do |lines, chunk|
      db.save({
        :title => title,
        :chunk => chunk,
        :text => lines.join('')
      })
      puts chunk + 1
    end
  end
end
2008-12-14 12:05:02 +01:00
# puts "The books have been stored in your CouchDB. To initiate the MapReduce process, visit http://127.0.0.1:5984/_utils/ in your browser and click 'word-count-example', then select view 'words' or 'count'. The process could take about 15 minutes on an average MacBook."
2009-01-16 00:05:55 +01:00
2008-05-25 20:49:37 +02:00