问题
I am making a web crawler. I'm passing the url through a crawler function and parsing it to get all the links in the anchor tag, then I am invoking same crawler function for all those urls using seperate goroutine for every url.
But if if send a request and cancel it before I get the response, all the groutines for that particular request are still running.
Now what I want is that when I cancel the request all the goroutines that got invoked due to that request stops.
Please guide.
Following is my code for the crawler function.
func crawler(c echo.Context, urlRec string, feed chan string, urlList *[]string, wg *sync.WaitGroup) {
defer wg.Done()
URL, _ := url.Parse(urlRec)
response, err := http.Get(urlRec)
if err != nil {
log.Print(err)
return
}
body := response.Body
defer body.Close()
tokenizer := html.NewTokenizer(body)
flag := true
for flag {
tokenType := tokenizer.Next()
switch {
case tokenType == html.ErrorToken:
flag = false
break
case tokenType == html.StartTagToken:
token := tokenizer.Token()
// Check if the token is an <a> tag
isAnchor := token.Data == "a"
if !isAnchor {
continue
}
ok, urlHref := getReference(token)
if !ok {
continue
}
// Make sure the url begines in http**
hasProto := strings.Index(urlHref, "http") == 0
if hasProto {
if !urlInURLList(urlHref, urlList) {
if strings.Contains(urlHref, URL.Host) {
*urlList = append(*urlList, urlHref)
// fmt.Println(urlHref)
// c.String(http.StatusOK, urlHref+"\n")Documents
if !checkExt(filepath.Ext(urlHref)) {
wg.Add(1)
go crawler(c, urlHref, feed, urlList, wg)
}
}
}
}
}
}
}
And following is my POST request handler
func scrapePOST(c echo.Context) error {
var urlList []string
urlSession := urlFound{}
var wg sync.WaitGroup
urlParam := c.FormValue("url")
feed := make(chan string, 1000)
wg.Add(1)
go crawler(c, urlParam, feed, &urlList, &wg)
wg.Wait()
var count = 0
for _, url := range urlList {
if filepath.Ext(url) == ".jpg" || filepath.Ext(url) == ".jpeg" || filepath.Ext(url) == ".png" {
urlSession.Images = append(urlSession.Images, url)
} else if filepath.Ext(url) == ".doc" || filepath.Ext(url) == ".docx" || filepath.Ext(url) == ".pdf" || filepath.Ext(url) == ".ppt" {
urlSession.Documents = append(urlSession.Documents, url)
} else {
urlSession.Links = append(urlSession.Links, url)
}
count = count + 1
}
urlSession.Count = count
// jsonResp, _ := json.Marshal(urlSession)
// fmt.Print(urlSession)
return c.JSON(http.StatusOK, urlSession)
}
回答1:
The echo context exposes the HTTP request, which has a context tied to the server request already. Just get that context, and check it for cancellation, and/or pass it along to methods that take a context.
ctx := c.Request().Context()
select {
case <-ctx.Done():
return ctx.Err()
default:
// Continue handling the request
}
// and pass along to the db or whatever else:
rows, err := db.QueryContext(ctx, ...)
If the client aborts the connection, the Request-scoped context will automatically be cancelled.
If you want to add your own cancellation conditions, (timeouts, or whatever) you can do that, too:
req := c.Request()
ctx, cancel := context.WithCancel(req.Context())
req.WithContext(ctx)
defer cancel()
// do stuff, which may conditionally call cancel() to cancel the context early
来源:https://stackoverflow.com/questions/45525332/close-all-goroutines-when-http-request-is-cancelled