sun的程序員也是程序員??!(續)
剛剛鄙視完sun,繼續performance tuning,結果又發現問題:測試中,偶爾會出現一些古怪錯誤,經檢查,發現有以下可疑的異常:[#|2010-05-05T14:27:37.295+0800|WARNING|sun-glassfish-comms-server2.0|com.sun.jbi.httpsoapbc.OutboundMessageProcessor|_ThreadID=53;_ThreadName=HTTPBC-OutboundReceiver-47;_RequestID=42321e60-6723-4831-a99a-b4dd1ac3e35f;|HTTPBC-E00759: An exception occured while processing a reply message. HTTP transport error: java.util.ConcurrentModificationException
com.sun.xml.ws.client.ClientTransportException: HTTP transport error: java.util.ConcurrentModificationException
at com.sun.xml.ws.transport.http.client.HttpClientTransport.getOutput(HttpClientTransport.java:134)
at com.sun.xml.ws.transport.http.client.HttpTransportPipe.process(HttpTransportPipe.java:143)
at com.sun.xml.ws.transport.http.client.HttpTransportPipe.processRequest(HttpTransportPipe.java:89)
at com.sun.xml.ws.transport.DeferredTransportPipe.processRequest(DeferredTransportPipe.java:91)
at com.sun.xml.ws.api.pipe.Fiber.__doRun(Fiber.java:595)
at com.sun.xml.ws.api.pipe.Fiber._doRun(Fiber.java:554)
at com.sun.xml.ws.api.pipe.Fiber.doRun(Fiber.java:539)
at com.sun.xml.ws.api.pipe.Fiber.runSync(Fiber.java:436)
at com.sun.xml.ws.api.pipe.helper.AbstractTubeImpl.process(AbstractTubeImpl.java:106)
at com.sun.xml.ws.tx.client.TxClientPipe.process(TxClientPipe.java:177)
at com.sun.xml.ws.api.pipe.helper.PipeAdapter.processRequest(PipeAdapter.java:115)
at com.sun.xml.ws.api.pipe.Fiber.__doRun(Fiber.java:595)
at com.sun.xml.ws.api.pipe.Fiber._doRun(Fiber.java:554)
at com.sun.xml.ws.api.pipe.Fiber.doRun(Fiber.java:539)
at com.sun.xml.ws.api.pipe.Fiber.runSync(Fiber.java:436)
at com.sun.xml.ws.client.Stub.process(Stub.java:248)
at com.sun.xml.ws.client.dispatch.DispatchImpl.doInvoke(DispatchImpl.java:180)
at com.sun.xml.ws.client.dispatch.DispatchImpl.invoke(DispatchImpl.java:206)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.outboundCall(OutboundMessageProcessor.java:1256)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.dispatch(OutboundMessageProcessor.java:1296)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.processRequestReplyOutbound(OutboundMessageProcessor.java:747)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.processMessage(OutboundMessageProcessor.java:257)
at com.sun.jbi.httpsoapbc.OutboundAction.run(OutboundAction.java:63)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
Caused by: java.util.ConcurrentModificationException
at java.util.HashMap$HashIterator.nextEntry(HashMap.java:793)
at java.util.HashMap$EntryIterator.next(HashMap.java:834)
at java.util.HashMap$EntryIterator.next(HashMap.java:832)
at com.sun.xml.ws.transport.http.client.HttpClientTransport.createHttpConnection(HttpClientTransport.java:364)
at com.sun.xml.ws.transport.http.client.HttpClientTransport.getOutput(HttpClientTransport.java:118)
... 25 more
com.sun.xml.ws.client.ClientTransportException: HTTP transport error: java.util.ConcurrentModificationException
at com.sun.xml.ws.transport.http.client.HttpClientTransport.getOutput(HttpClientTransport.java:134)
at com.sun.xml.ws.transport.http.client.HttpTransportPipe.process(HttpTransportPipe.java:143)
at com.sun.xml.ws.transport.http.client.HttpTransportPipe.processRequest(HttpTransportPipe.java:89)
at com.sun.xml.ws.transport.DeferredTransportPipe.processRequest(DeferredTransportPipe.java:91)
at com.sun.xml.ws.api.pipe.Fiber.__doRun(Fiber.java:595)
at com.sun.xml.ws.api.pipe.Fiber._doRun(Fiber.java:554)
at com.sun.xml.ws.api.pipe.Fiber.doRun(Fiber.java:539)
at com.sun.xml.ws.api.pipe.Fiber.runSync(Fiber.java:436)
at com.sun.xml.ws.api.pipe.helper.AbstractTubeImpl.process(AbstractTubeImpl.java:106)
at com.sun.xml.ws.tx.client.TxClientPipe.process(TxClientPipe.java:177)
at com.sun.xml.ws.api.pipe.helper.PipeAdapter.processRequest(PipeAdapter.java:115)
at com.sun.xml.ws.api.pipe.Fiber.__doRun(Fiber.java:595)
at com.sun.xml.ws.api.pipe.Fiber._doRun(Fiber.java:554)
at com.sun.xml.ws.api.pipe.Fiber.doRun(Fiber.java:539)
at com.sun.xml.ws.api.pipe.Fiber.runSync(Fiber.java:436)
at com.sun.xml.ws.client.Stub.process(Stub.java:248)
at com.sun.xml.ws.client.dispatch.DispatchImpl.doInvoke(DispatchImpl.java:180)
at com.sun.xml.ws.client.dispatch.DispatchImpl.invoke(DispatchImpl.java:206)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.outboundCall(OutboundMessageProcessor.java:1256)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.dispatch(OutboundMessageProcessor.java:1296)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.processRequestReplyOutbound(OutboundMessageProcessor.java:747)
at com.sun.jbi.httpsoapbc.OutboundMessageProcessor.processMessage(OutboundMessageProcessor.java:257)
at com.sun.jbi.httpsoapbc.OutboundAction.run(OutboundAction.java:63)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
Caused by: java.util.ConcurrentModificationException
at java.util.HashMap$HashIterator.nextEntry(HashMap.java:793)
at java.util.HashMap$EntryIterator.next(HashMap.java:834)
at java.util.HashMap$EntryIterator.next(HashMap.java:832)
at com.sun.xml.ws.transport.http.client.HttpClientTransport.createHttpConnection(HttpClientTransport.java:364)
at com.sun.xml.ws.transport.http.client.HttpClientTransport.getOutput(HttpClientTransport.java:118)
... 25 more
從異常上看,調用metro進行webservice的調用過程中,有對hashmap做游歷進行讀取的操作,期間拋出ConcurrentModificationException。感覺這個又是bug了,畢竟這個前前后后的代碼,都是sun自己的:openESB, metro, jdk。
找到hashmap拋出異常的代碼:
final Entry<K,V> nextEntry() {
if (modCount != expectedModCount)
throw new ConcurrentModificationException();
if (modCount != expectedModCount)
throw new ConcurrentModificationException();
明顯這個ConcurrentModificationException是hashmap主動拋出的,看條件if (modCount != expectedModCount)
找到expectedModCount,HashIterator構造時初始化為當時hashmap實例的modCount并保持不再修改,實際就是記下迭代開始時hashmap的狀態:
HashIterator() {
expectedModCount = modCount;
...
}
expectedModCount = modCount;
...
}
再看modCount相關代碼
/**
* The number of times this HashMap has been structurally modified
* Structural modifications are those that change the number of mappings in
* the HashMap or otherwise modify its internal structure (e.g.,
* rehash). This field is used to make iterators on Collection-views of
* the HashMap fail-fast. (See ConcurrentModificationException).
*/
transient volatile int modCount;
從javadoc中可以得知modCount是用來記錄hashmap實例的結構被修改的次數。同時明確指出這個域用來在做迭代時實現fail-fast。
現在非常明確的可以知道問題的來源了:在hashmap實例的坐迭代的過程中,其他線程修改了這個hashmap,導致modCount 和 expectedModCount不符,因此直接拋出ConcurrentModificationException)來實現fail-fast。
hashmap的這個做法沒有問題,那么問題就是出在它的使用者上了:為什么在hashmap進行迭代的過程中,會修改這個hashmap?而且,明顯的沒有做同步保護,要知道hashmap是明確申明不是線程安全的。
先找到這個hashmap被調用的代碼,metro下的HttpTransportPipe,方便起見請打開以下URL使用fisheye工具查看代碼:
http://fisheye5.cenqua.com/browse/jax-ws-sources/jaxws-ri/rt/src/com/sun/xml/ws/transport/http/client/HttpTransportPipe.java?r=1.14
public Packet process(Packet request) {
HttpClientTransport con;
try {
// get transport headers from message
Map<String, List<String>> reqHeaders = (Map<String, List<String>>) request.invocationProperties.get(MessageContext.HTTP_REQUEST_HEADERS);
//assign empty map if its null
if(reqHeaders == null){
reqHeaders = new HashMap<String, List<String>>();
}
......
for (Map.Entry<String, List<String>> entry : reqHeaders.entrySet()) {
httpConnection.addRequestProperty(entry.getKey(), entry.getValue().get(0));
}
HttpClientTransport con;
try {
// get transport headers from message
Map<String, List<String>> reqHeaders = (Map<String, List<String>>) request.invocationProperties.get(MessageContext.HTTP_REQUEST_HEADERS);
//assign empty map if its null
if(reqHeaders == null){
reqHeaders = new HashMap<String, List<String>>();
}
......
for (Map.Entry<String, List<String>> entry : reqHeaders.entrySet()) {
httpConnection.addRequestProperty(entry.getKey(), entry.getValue().get(0));
}
出現問題的reqHeaders是從request中獲取到的,明顯是這個方法之外還有其他線程在修改這個hashmap。
簡單修改一下代碼:
public Packet process(Packet request) {
HttpClientTransport con;
try {
// get transport headers from message
Map<String, List<String>> reqHeaders = new HashMap<String, List<String>>();
Map<String, List<String>> reqHeadersInRequest = (Map<String, List<String>>) request.invocationProperties.get(MessageContext.HTTP_REQUEST_HEADERS);
//assign empty map if its null
if(reqHeadersInRequest != null){
reqHeaders.putAll(reqHeadersInRequest);
}
HttpClientTransport con;
try {
// get transport headers from message
Map<String, List<String>> reqHeaders = new HashMap<String, List<String>>();
Map<String, List<String>> reqHeadersInRequest = (Map<String, List<String>>) request.invocationProperties.get(MessageContext.HTTP_REQUEST_HEADERS);
//assign empty map if its null
if(reqHeadersInRequest != null){
reqHeaders.putAll(reqHeadersInRequest);
}
不直接使用原有的hashmap實例了,既然其他線程會同時進行修改操作,那么這個實例就是很不安全的了。我們重新new了一個新的HashMap,然后將原有HashMap的數據用putAll方法設置進入。用編譯后的class文件覆蓋glassfish/lib/webservice-rt.jar中的同名文件,重新測試,跑了20分鐘,上述的ConcurrentModificationException異常沒有再出現。
總結一下這個bug反映的問題,sun的開發人員在metro中是這樣使用hashmap:
1. 將hashmap按照引用在各個實例間傳遞
2. 在不同地方有不同線程同時讀寫
3. 讀寫時不加鎖,不做同步保護
很無語,這種做法,不是自己找死嗎?hashmap不是線程安全,使用hashmap時并按照引用傳遞時,要不保證只讀,要不就保證同時只有一個線程進行讀寫,前兩者都不能保證就必須自己加鎖做同步保護。
后來找到這個類的最新版本,發現在后面的版本中已經fix這個問題,有興趣的話可以打開下面的URL看版本對比,sun官方的fix方式和我上面的完全一致,呵呵。
http://fisheye5.cenqua.com/browse/jax-ws-sources/jaxws-ri/rt/src/com/sun/xml/ws/transport/http/client/HttpTransportPipe.java?r1=1.14&r2=1.15&u=3&k=kv
還有一個關聯的issue,https://jax-ws.dev.java.net/issues/show_bug.cgi?id=467,看了一下內容,和我們的場景完全不一樣,看來修改這個地方純屬巧合。
說點題外話,算是牢騷吧:
有點懷疑metro是不是根本就沒有做過性能測試,我們的測試場景,openESB下通過bepl調用4個我們稱為common service的webservice,目前大概做到1200個tps,算下來common service的webservice的tps大概是1200*4 = 5K附近,上面的問題就非常明顯,之前tps沒有上去前沒有這么嚴重。
可以參考我之前的一個blog, http://www.aygfsteel.com/aoxj/archive/2010/04/29/319706.html,在解決這里提到的http long connection 和 TIME_AIT的問題之前,我們的tps比較低,cpu壓不上去,當時好像這個問題不明顯。后來搞定之后tps上來了才暴露出來。
考慮上一個blog中 == 比較無效導致cache失效的bug,我對metro的代碼質量真是很沒有信心。按說這樣的大型項目,release之前怎么也要做做壓力測試,穩定性測試之類,很容易發現類似問題才是。我相信,不是每個用metro的地方,tps都只需要跑幾十tps而已吧。我在我的普通開發機上做測試,大概只能跑到100個tps,沒有發現出錯。換到比較強勁的機器,tps上到1000后,上面的錯誤立即凸現。
posted on 2010-05-05 21:18 sky ao 閱讀(2877) 評論(3) 編輯 收藏 所屬分類: 雜談