search.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. /*
  2. Copyright 2015 Google Inc. All Rights Reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. // This is a sample web server that uses Cloud Bigtable as the storage layer
  14. // for a simple document-storage and full-text-search service.
  15. // It has three functions:
  16. // - Add a document. This adds the content of a user-supplied document to the
  17. // Bigtable, and adds references to the document to an index in the Bigtable.
  18. // The document is indexed under each unique word in the document.
  19. // - Search the index. This returns documents containing each word in a user
  20. // query, with snippets and links to view the whole document.
  21. // - Clear the table. This deletes and recreates the Bigtable, then refills it from a prototype table.
  22. package main
  23. import (
  24. "bytes"
  25. "flag"
  26. "fmt"
  27. "html/template"
  28. "io"
  29. "log"
  30. "net/http"
  31. "os"
  32. "strings"
  33. "sync"
  34. "time"
  35. "unicode"
  36. "golang.org/x/net/context"
  37. "google.golang.org/cloud/bigtable"
  38. )
  39. var (
  40. project = flag.String("project", "", "The name of the project.")
  41. zone = flag.String("zone", "", "The zone of the project.")
  42. cluster = flag.String("cluster", "", "The name of the Cloud Bigtable cluster.")
  43. tableName = flag.String("table", "docindex", "The name of the table containing the documents and index.")
  44. credFile = flag.String("creds", "", "File containing credentials")
  45. rebuild = flag.Bool("rebuild", false, "Rebuild the table from scratch on startup.")
  46. client *bigtable.Client
  47. adminClient *bigtable.AdminClient
  48. table *bigtable.Table
  49. addTemplate = template.Must(template.New("").Parse(`<html><body>
  50. Added {{.Title}}
  51. </body></html>`))
  52. contentTemplate = template.Must(template.New("").Parse(`<html><body>
  53. <b>{{.Title}}</b><br><br>
  54. {{.Content}}
  55. </body></html>`))
  56. searchTemplate = template.Must(template.New("").Parse(`<html><body>
  57. Results for <b>{{.Query}}</b>:<br><br>
  58. {{range .Results}}
  59. <a href="/content?name={{.Title}}">{{.Title}}</a><br>
  60. <i>{{.Snippet}}</i><br><br>
  61. {{end}}
  62. </body></html>`))
  63. )
  64. const (
  65. // prototypeTableName is an existing table containing some documents.
  66. // Rebuilding a table will populate it with the data from this table.
  67. prototypeTableName = "shakespearetemplate"
  68. indexColumnFamily = "i"
  69. contentColumnFamily = "c"
  70. mainPage = `
  71. <html>
  72. <head>
  73. <title>Document Search</title>
  74. </head>
  75. <body>
  76. Search for documents:
  77. <form action="/search" method="post">
  78. <div><input type="text" name="q" size=80></div>
  79. <div><input type="submit" value="Search"></div>
  80. </form>
  81. Add a document:
  82. <form action="/add" method="post">
  83. Document name:
  84. <div><textarea name="name" rows="1" cols="80"></textarea></div>
  85. Document text:
  86. <div><textarea name="content" rows="20" cols="80"></textarea></div>
  87. <div><input type="submit" value="Submit"></div>
  88. </form>
  89. Rebuild table:
  90. <form action="/clearindex" method="post">
  91. <div><input type="submit" value="Rebuild"></div>
  92. </form>
  93. </body>
  94. </html>
  95. `
  96. )
  97. func main() {
  98. flag.Parse()
  99. if *tableName == prototypeTableName {
  100. log.Fatal("Can't use " + prototypeTableName + " as your table.")
  101. }
  102. // Let the library get credentials from file.
  103. os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", *credFile)
  104. // Make an admin client.
  105. var err error
  106. if adminClient, err = bigtable.NewAdminClient(context.Background(), *project, *zone, *cluster); err != nil {
  107. log.Fatal("Bigtable NewAdminClient:", err)
  108. }
  109. // Make a regular client.
  110. client, err = bigtable.NewClient(context.Background(), *project, *zone, *cluster)
  111. if err != nil {
  112. log.Fatal("Bigtable NewClient:", err)
  113. }
  114. // Open the table.
  115. table = client.Open(*tableName)
  116. // Rebuild the table if the command-line flag is set.
  117. if *rebuild {
  118. if err := rebuildTable(); err != nil {
  119. log.Fatal(err)
  120. }
  121. }
  122. // Set up HTML handlers, and start the web server.
  123. http.HandleFunc("/search", handleSearch)
  124. http.HandleFunc("/content", handleContent)
  125. http.HandleFunc("/add", handleAddDoc)
  126. http.HandleFunc("/clearindex", handleClear)
  127. http.HandleFunc("/", handleMain)
  128. log.Fatal(http.ListenAndServe(":8080", nil))
  129. }
  130. // handleMain outputs the home page, containing a search box, an "add document" box, and "clear table" button.
  131. func handleMain(w http.ResponseWriter, r *http.Request) {
  132. io.WriteString(w, mainPage)
  133. }
  134. // tokenize splits a string into tokens.
  135. // This is very simple, it's not a good tokenization function.
  136. func tokenize(s string) []string {
  137. wordMap := make(map[string]bool)
  138. f := strings.FieldsFunc(s, func(r rune) bool { return !unicode.IsLetter(r) })
  139. for _, word := range f {
  140. word = strings.ToLower(word)
  141. wordMap[word] = true
  142. }
  143. words := make([]string, 0, len(wordMap))
  144. for word := range wordMap {
  145. words = append(words, word)
  146. }
  147. return words
  148. }
  149. // handleContent fetches the content of a document from the Bigtable and returns it.
  150. func handleContent(w http.ResponseWriter, r *http.Request) {
  151. ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
  152. name := r.FormValue("name")
  153. if len(name) == 0 {
  154. http.Error(w, "No document name supplied.", http.StatusBadRequest)
  155. return
  156. }
  157. row, err := table.ReadRow(ctx, name)
  158. if err != nil {
  159. http.Error(w, "Error reading content: "+err.Error(), http.StatusInternalServerError)
  160. return
  161. }
  162. content := row[contentColumnFamily]
  163. if len(content) == 0 {
  164. http.Error(w, "Document not found.", http.StatusNotFound)
  165. return
  166. }
  167. var buf bytes.Buffer
  168. if err := contentTemplate.ExecuteTemplate(&buf, "", struct{ Title, Content string }{name, string(content[0].Value)}); err != nil {
  169. http.Error(w, "Error executing HTML template: "+err.Error(), http.StatusInternalServerError)
  170. return
  171. }
  172. io.Copy(w, &buf)
  173. }
  174. // handleSearch responds to search queries, returning links and snippets for matching documents.
  175. func handleSearch(w http.ResponseWriter, r *http.Request) {
  176. ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
  177. query := r.FormValue("q")
  178. // Split the query into words.
  179. words := tokenize(query)
  180. if len(words) == 0 {
  181. http.Error(w, "Empty query.", http.StatusBadRequest)
  182. return
  183. }
  184. // readRows reads from many rows concurrently.
  185. readRows := func(rows []string) ([]bigtable.Row, error) {
  186. results := make([]bigtable.Row, len(rows))
  187. errors := make([]error, len(rows))
  188. var wg sync.WaitGroup
  189. for i, row := range rows {
  190. wg.Add(1)
  191. go func(i int, row string) {
  192. defer wg.Done()
  193. results[i], errors[i] = table.ReadRow(ctx, row)
  194. }(i, row)
  195. }
  196. wg.Wait()
  197. for _, err := range errors {
  198. if err != nil {
  199. return nil, err
  200. }
  201. }
  202. return results, nil
  203. }
  204. // For each query word, get the list of documents containing it.
  205. results, err := readRows(words)
  206. if err != nil {
  207. http.Error(w, "Error reading index: "+err.Error(), http.StatusInternalServerError)
  208. return
  209. }
  210. // Count how many of the query words each result contained.
  211. hits := make(map[string]int)
  212. for _, r := range results {
  213. for _, r := range r[indexColumnFamily] {
  214. hits[r.Column]++
  215. }
  216. }
  217. // Build a slice of all the documents that matched every query word.
  218. var matches []string
  219. for doc, count := range hits {
  220. if count == len(words) {
  221. matches = append(matches, doc[len(indexColumnFamily+":"):])
  222. }
  223. }
  224. // Fetch the content of those documents from the Bigtable.
  225. content, err := readRows(matches)
  226. if err != nil {
  227. http.Error(w, "Error reading results: "+err.Error(), http.StatusInternalServerError)
  228. return
  229. }
  230. type result struct{ Title, Snippet string }
  231. data := struct {
  232. Query string
  233. Results []result
  234. }{query, nil}
  235. // Output links and snippets.
  236. for i, doc := range matches {
  237. var text string
  238. c := content[i][contentColumnFamily]
  239. if len(c) > 0 {
  240. text = string(c[0].Value)
  241. }
  242. if len(text) > 100 {
  243. text = text[:100] + "..."
  244. }
  245. data.Results = append(data.Results, result{doc, text})
  246. }
  247. var buf bytes.Buffer
  248. if err := searchTemplate.ExecuteTemplate(&buf, "", data); err != nil {
  249. http.Error(w, "Error executing HTML template: "+err.Error(), http.StatusInternalServerError)
  250. return
  251. }
  252. io.Copy(w, &buf)
  253. }
  254. // handleAddDoc adds a document to the index.
  255. func handleAddDoc(w http.ResponseWriter, r *http.Request) {
  256. if r.Method != "POST" {
  257. http.Error(w, "POST requests only", http.StatusMethodNotAllowed)
  258. return
  259. }
  260. ctx, _ := context.WithTimeout(context.Background(), time.Minute)
  261. name := r.FormValue("name")
  262. if len(name) == 0 {
  263. http.Error(w, "Empty document name!", http.StatusBadRequest)
  264. return
  265. }
  266. content := r.FormValue("content")
  267. if len(content) == 0 {
  268. http.Error(w, "Empty document content!", http.StatusBadRequest)
  269. return
  270. }
  271. var (
  272. writeErr error // Set if any write fails.
  273. mu sync.Mutex // Protects writeErr
  274. wg sync.WaitGroup // Used to wait for all writes to finish.
  275. )
  276. // writeOneColumn writes one column in one row, updates err if there is an error,
  277. // and signals wg that one operation has finished.
  278. writeOneColumn := func(row, family, column, value string, ts bigtable.Timestamp) {
  279. mut := bigtable.NewMutation()
  280. mut.Set(family, column, ts, []byte(value))
  281. err := table.Apply(ctx, row, mut)
  282. if err != nil {
  283. mu.Lock()
  284. writeErr = err
  285. mu.Unlock()
  286. }
  287. }
  288. // Start a write to store the document content.
  289. wg.Add(1)
  290. go func() {
  291. writeOneColumn(name, contentColumnFamily, "", content, bigtable.Now())
  292. wg.Done()
  293. }()
  294. // Start writes to store the document name in the index for each word in the document.
  295. words := tokenize(content)
  296. for _, word := range words {
  297. var (
  298. row = word
  299. family = indexColumnFamily
  300. column = name
  301. value = ""
  302. ts = bigtable.Now()
  303. )
  304. wg.Add(1)
  305. go func() {
  306. // TODO: should use a semaphore to limit the number of concurrent writes.
  307. writeOneColumn(row, family, column, value, ts)
  308. wg.Done()
  309. }()
  310. }
  311. wg.Wait()
  312. if writeErr != nil {
  313. http.Error(w, "Error writing to Bigtable: "+writeErr.Error(), http.StatusInternalServerError)
  314. return
  315. }
  316. var buf bytes.Buffer
  317. if err := addTemplate.ExecuteTemplate(&buf, "", struct{ Title string }{name}); err != nil {
  318. http.Error(w, "Error executing HTML template: "+err.Error(), http.StatusInternalServerError)
  319. return
  320. }
  321. io.Copy(w, &buf)
  322. }
  323. // rebuildTable deletes the table if it exists, then creates the table, with the index column family.
  324. func rebuildTable() error {
  325. ctx, _ := context.WithTimeout(context.Background(), 5*time.Minute)
  326. adminClient.DeleteTable(ctx, *tableName)
  327. if err := adminClient.CreateTable(ctx, *tableName); err != nil {
  328. return fmt.Errorf("CreateTable: %v", err)
  329. }
  330. time.Sleep(20 * time.Second)
  331. if err := adminClient.CreateColumnFamily(ctx, *tableName, indexColumnFamily); err != nil {
  332. return fmt.Errorf("CreateColumnFamily: %v", err)
  333. }
  334. if err := adminClient.CreateColumnFamily(ctx, *tableName, contentColumnFamily); err != nil {
  335. return fmt.Errorf("CreateColumnFamily: %v", err)
  336. }
  337. // Open the prototype table. It contains a number of documents to get started with.
  338. prototypeTable := client.Open(prototypeTableName)
  339. var (
  340. writeErr error // Set if any write fails.
  341. mu sync.Mutex // Protects writeErr
  342. wg sync.WaitGroup // Used to wait for all writes to finish.
  343. )
  344. copyRowToTable := func(row bigtable.Row) bool {
  345. mu.Lock()
  346. failed := writeErr != nil
  347. mu.Unlock()
  348. if failed {
  349. return false
  350. }
  351. mut := bigtable.NewMutation()
  352. for family, items := range row {
  353. for _, item := range items {
  354. // Get the column name, excluding the column family name and ':' character.
  355. columnWithoutFamily := item.Column[len(family)+1:]
  356. mut.Set(family, columnWithoutFamily, bigtable.Now(), item.Value)
  357. }
  358. }
  359. wg.Add(1)
  360. go func() {
  361. // TODO: should use a semaphore to limit the number of concurrent writes.
  362. if err := table.Apply(ctx, row.Key(), mut); err != nil {
  363. mu.Lock()
  364. writeErr = err
  365. mu.Unlock()
  366. }
  367. wg.Done()
  368. }()
  369. return true
  370. }
  371. // Create a filter that only accepts the column families we're interested in.
  372. filter := bigtable.FamilyFilter(indexColumnFamily + "|" + contentColumnFamily)
  373. // Read every row from prototypeTable, and call copyRowToTable to copy it to our table.
  374. err := prototypeTable.ReadRows(ctx, bigtable.InfiniteRange(""), copyRowToTable, bigtable.RowFilter(filter))
  375. wg.Wait()
  376. if err != nil {
  377. return err
  378. }
  379. return writeErr
  380. }
  381. // handleClear calls rebuildTable
  382. func handleClear(w http.ResponseWriter, r *http.Request) {
  383. if r.Method != "POST" {
  384. http.Error(w, "POST requests only", http.StatusMethodNotAllowed)
  385. return
  386. }
  387. if err := rebuildTable(); err != nil {
  388. http.Error(w, "Failed to rebuild index: "+err.Error(), http.StatusInternalServerError)
  389. return
  390. }
  391. fmt.Fprint(w, "Rebuilt index.\n")
  392. }